Multiplexing token filter (#31208)
The `multiplexer` filter emits multiple tokens at the same position, each
version of the token having been passed through a different filter chain.
Identical tokens at the same position are removed.

This allows users to, for example, index lowercase and original-case tokens,
or stemmed and unstemmed versions, in the same field, so that they can search
for a stemmed term within x positions of an unstemmed term.
romseygeek authored Jun 20, 2018
1 parent df10704 commit 5683bc6
Showing 8 changed files with 469 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/reference/analysis/tokenfilters.asciidoc
@@ -35,6 +35,8 @@
include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]

include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]

include::tokenfilters/multiplexer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
116 changes: 116 additions & 0 deletions docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc
@@ -0,0 +1,116 @@
[[analysis-multiplexer-tokenfilter]]
=== Multiplexer Token Filter

A token filter of type `multiplexer` will emit multiple tokens at the same position,
each version of the token having been run through a different filter. Identical
output tokens at the same position will be removed.

WARNING: If the incoming token stream has duplicate tokens, then these will also be
removed by the multiplexer.

[float]
=== Options
[horizontal]
filters:: a list of token filters to apply to incoming tokens. These can be any
token filters defined elsewhere in the index's analysis settings. Filters can be
chained using a comma-delimited string, so for example `"lowercase, porter_stem"`
would apply the `lowercase` filter and then the `porter_stem` filter to a single token.

WARNING: Shingle or multi-word synonym token filters will not function normally
when they are declared in the filters array, because they read ahead internally,
which is unsupported by the multiplexer.

preserve_original:: if `true` (the default) then emit the original token in
addition to the filtered tokens.


[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /multiplexer_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_multiplexer" ]
        }
      },
      "filter" : {
        "my_multiplexer" : {
          "type" : "multiplexer",
          "filters" : [ "lowercase", "lowercase, porter_stem" ]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

And test it like:

[source,js]
--------------------------------------------------
POST /multiplexer_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "Going HOME"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "Going",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "going",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "go",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "HOME",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "home", <1>
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The stemmer has also emitted a token `home` at position 1, but because it is a
duplicate of this token, it has been removed from the token stream.
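
Because the original and filtered tokens share positions in the same field,
proximity queries can mix the two forms, for example to find a stemmed term next
to an unstemmed one. A minimal sketch using span queries, assuming documents have
been indexed with a `text` field that uses `my_analyzer` (the mapping itself is
not part of the example above):

[source,js]
--------------------------------------------------
GET /multiplexer_example/_search
{
  "query": {
    "span_near": {
      "clauses": [
        { "span_term": { "text": "go" } },
        { "span_term": { "text": "HOME" } }
      ],
      "slop": 0,
      "in_order": true
    }
  }
}
--------------------------------------------------
// CONSOLE

Here `go` matches the stemmed token at position 0 of `Going HOME`, while `HOME`
matches the preserved original-case token at position 1.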
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -226,6 +226,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("limit", LimitTokenCountFilterFactory::new);
filters.put("lowercase", LowerCaseTokenFilterFactory::new);
filters.put("min_hash", MinHashTokenFilterFactory::new);
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("nGram", NGramTokenFilterFactory::new);
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
195 changes: 195 additions & 0 deletions modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java
@@ -0,0 +1,195 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {

    private List<TokenFilterFactory> filters;
    private List<String> filterNames;
    private final boolean preserveOriginal;

    // A pass-through factory used to emit the unmodified token when preserve_original is enabled
    private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
        @Override
        public String name() {
            return "identity";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return tokenStream;
        }
    };

    public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
        super(indexSettings, name, settings);
        this.filterNames = settings.getAsList("filters");
        this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
        for (TokenFilterFactory tff : filters) {
            functions.add(tff::create);
        }
        // Identical tokens emitted at the same position by different branches are deduplicated
        return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
    }

    @Override
    public void setReferences(Map<String, TokenFilterFactory> factories) {
        filters = new ArrayList<>();
        if (preserveOriginal) {
            filters.add(IDENTITY_FACTORY);
        }
        for (String filter : filterNames) {
            String[] parts = Strings.tokenizeToStringArray(filter, ",");
            if (parts.length == 1) {
                filters.add(resolveFilterFactory(factories, parts[0]));
            } else {
                // A comma-delimited entry such as "lowercase, porter_stem" becomes a single chained filter
                List<TokenFilterFactory> chain = new ArrayList<>();
                for (String subfilter : parts) {
                    chain.add(resolveFilterFactory(factories, subfilter));
                }
                filters.add(chainFilters(filter, chain));
            }
        }
    }

    private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                for (TokenFilterFactory tff : filters) {
                    tokenStream = tff.create(tokenStream);
                }
                return tokenStream;
            }
        };
    }

    private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
        if (factories.containsKey(name) == false) {
            throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
        } else {
            return factories.get(name);
        }
    }

    private final class MultiplexTokenFilter extends TokenFilter {

        private final TokenStream source;
        private final int filterCount;

        private int selector;

        /**
         * Creates a MultiplexTokenFilter on the given input with a set of filters
         */
        MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
            super(input);
            TokenStream source = new MultiplexerFilter(input);
            for (int i = 0; i < filters.size(); i++) {
                final int slot = i;
                // Each filter branch only applies when the multiplexer's selector points at its slot
                source = new ConditionalTokenFilter(source, filters.get(i)) {
                    @Override
                    protected boolean shouldFilter() {
                        return slot == selector;
                    }
                };
            }
            this.source = source;
            this.filterCount = filters.size();
            // Start at the last slot so that the first incrementToken() call advances the input
            this.selector = filterCount - 1;
        }

        @Override
        public boolean incrementToken() throws IOException {
            return source.incrementToken();
        }

        @Override
        public void end() throws IOException {
            source.end();
        }

        @Override
        public void reset() throws IOException {
            source.reset();
        }

        private final class MultiplexerFilter extends TokenFilter {

            State state;
            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

            private MultiplexerFilter(TokenStream input) {
                super(input);
            }

            @Override
            public boolean incrementToken() throws IOException {
                if (selector >= filterCount - 1) {
                    // The current token has been replayed through every branch; read the next one
                    selector = 0;
                    if (input.incrementToken() == false) {
                        return false;
                    }
                    state = captureState();
                    return true;
                }
                // Replay the captured token into the next branch at the same position
                restoreState(state);
                posIncAtt.setPositionIncrement(0);
                selector++;
                return true;
            }

            @Override
            public void reset() throws IOException {
                super.reset();
                selector = filterCount - 1;
                this.state = null;
            }
        }

    }
}