Multiplexing token filter (#31208)
The `multiplexer` filter emits multiple tokens at the same position, each
version of the token having been passed through a different filter chain.
Identical tokens at the same position are removed.

This allows users to, for example, index lowercase and original-case tokens,
or stemmed and unstemmed versions, in the same field, so that they can search
for a stemmed term within x positions of an unstemmed term.
romseygeek authored Jun 20, 2018
1 parent df10704 commit 5683bc6
Showing 8 changed files with 469 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/reference/analysis/tokenfilters.asciidoc
@@ -35,6 +35,8 @@
include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]

include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]

include::tokenfilters/multiplexer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
116 changes: 116 additions & 0 deletions docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc
@@ -0,0 +1,116 @@
[[analysis-multiplexer-tokenfilter]]
=== Multiplexer Token Filter

A token filter of type `multiplexer` will emit multiple tokens at the same position,
each version of the token having been run through a different filter. Identical
output tokens at the same position will be removed.

WARNING: If the incoming token stream has duplicate tokens, then these will also be
removed by the multiplexer.

[float]
=== Options
[horizontal]
filters:: a list of token filters to apply to incoming tokens. These can be any
token filters defined elsewhere in the index's analysis settings. Filters can be
chained using a comma-delimited string, so for example `"lowercase, porter_stem"`
would apply the `lowercase` filter and then the `porter_stem` filter to a single token.

WARNING: Shingle or multi-word synonym token filters will not function normally
when they are declared in the filters array, because they read ahead internally,
which is unsupported by the multiplexer.

preserve_original:: if `true` (the default) then emit the original token in
addition to the filtered tokens.


[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /multiplexer_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_multiplexer" ]
        }
      },
      "filter" : {
        "my_multiplexer" : {
          "type" : "multiplexer",
          "filters" : [ "lowercase", "lowercase, porter_stem" ]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

And test it like:

[source,js]
--------------------------------------------------
POST /multiplexer_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "Going HOME"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "Going",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "going",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "go",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "HOME",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "home", <1>
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The stemmer has also emitted a token `home` at position 1, but because it is a
duplicate of this token, it has been removed from the token stream.
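
Because the original and filtered tokens share positions in the same field,
proximity queries can mix the two forms, for example to find a stemmed term next
to an unstemmed one. A minimal sketch using span queries, assuming documents have
been indexed with a `text` field that uses `my_analyzer` (the mapping itself is
not part of the example above):

[source,js]
--------------------------------------------------
GET /multiplexer_example/_search
{
  "query": {
    "span_near": {
      "clauses": [
        { "span_term": { "text": "go" } },
        { "span_term": { "text": "HOME" } }
      ],
      "slop": 0,
      "in_order": true
    }
  }
}
--------------------------------------------------
// CONSOLE

Here `go` matches the stemmed token at position 0 of `Going HOME`, while `HOME`
matches the preserved original-case token at position 1.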
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -226,6 +226,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("limit", LimitTokenCountFilterFactory::new);
filters.put("lowercase", LowerCaseTokenFilterFactory::new);
filters.put("min_hash", MinHashTokenFilterFactory::new);
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("nGram", NGramTokenFilterFactory::new);
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
195 changes: 195 additions & 0 deletions modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java
@@ -0,0 +1,195 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {

    private List<TokenFilterFactory> filters;
    private List<String> filterNames;
    private final boolean preserveOriginal;

    // A pass-through factory used to emit the unmodified token when preserve_original is enabled
    private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
        @Override
        public String name() {
            return "identity";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return tokenStream;
        }
    };

    public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
        super(indexSettings, name, settings);
        this.filterNames = settings.getAsList("filters");
        this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
        for (TokenFilterFactory tff : filters) {
            functions.add(tff::create);
        }
        // Identical tokens emitted at the same position by different branches are deduplicated
        return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
    }

    @Override
    public void setReferences(Map<String, TokenFilterFactory> factories) {
        filters = new ArrayList<>();
        if (preserveOriginal) {
            filters.add(IDENTITY_FACTORY);
        }
        for (String filter : filterNames) {
            String[] parts = Strings.tokenizeToStringArray(filter, ",");
            if (parts.length == 1) {
                filters.add(resolveFilterFactory(factories, parts[0]));
            } else {
                // A comma-delimited entry such as "lowercase, porter_stem" becomes a single chained filter
                List<TokenFilterFactory> chain = new ArrayList<>();
                for (String subfilter : parts) {
                    chain.add(resolveFilterFactory(factories, subfilter));
                }
                filters.add(chainFilters(filter, chain));
            }
        }
    }

    private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                for (TokenFilterFactory tff : filters) {
                    tokenStream = tff.create(tokenStream);
                }
                return tokenStream;
            }
        };
    }

    private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
        if (factories.containsKey(name) == false) {
            throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
        } else {
            return factories.get(name);
        }
    }

    private final class MultiplexTokenFilter extends TokenFilter {

        private final TokenStream source;
        private final int filterCount;

        private int selector;

        /**
         * Creates a MultiplexTokenFilter on the given input with a set of filters
         */
        MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
            super(input);
            TokenStream source = new MultiplexerFilter(input);
            for (int i = 0; i < filters.size(); i++) {
                final int slot = i;
                // Each filter branch only applies when the multiplexer's selector points at its slot
                source = new ConditionalTokenFilter(source, filters.get(i)) {
                    @Override
                    protected boolean shouldFilter() {
                        return slot == selector;
                    }
                };
            }
            this.source = source;
            this.filterCount = filters.size();
            // Start at the last slot so that the first incrementToken() call advances the input
            this.selector = filterCount - 1;
        }

        @Override
        public boolean incrementToken() throws IOException {
            return source.incrementToken();
        }

        @Override
        public void end() throws IOException {
            source.end();
        }

        @Override
        public void reset() throws IOException {
            source.reset();
        }

        private final class MultiplexerFilter extends TokenFilter {

            State state;
            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

            private MultiplexerFilter(TokenStream input) {
                super(input);
            }

            @Override
            public boolean incrementToken() throws IOException {
                if (selector >= filterCount - 1) {
                    // The current token has been replayed through every branch; read the next one
                    selector = 0;
                    if (input.incrementToken() == false) {
                        return false;
                    }
                    state = captureState();
                    return true;
                }
                // Replay the captured token into the next branch at the same position
                restoreState(state);
                posIncAtt.setPositionIncrement(0);
                selector++;
                return true;
            }

            @Override
            public void reset() throws IOException {
                super.reset();
                selector = filterCount - 1;
                this.state = null;
            }
        }

    }
}