Skip to content

Commit

Permalink
Add analysis modes to restrict token filter use contexts (#36103)
Browse files Browse the repository at this point in the history
Currently token filter settings are treated as fixed once they are declared and
used in an analyzer. This is done to prevent changes in analyzers that are already
used actively to index documents, since changes to the analysis chain could
corrupt the index. However, it would be safe to allow updates to token
filters at search time ("search_analyzer"). This change introduces a new
property of token filters that allows marking them as only being usable at search
or at index time. Any analyzer that uses these token filters inherits that property
and can be rejected if they are used in other contexts. This is a first step towards
making specific token filters (e.g. synonym filter) updateable.

Relates to #29051
  • Loading branch information
Christoph Büscher authored Mar 12, 2019
1 parent 6c6c44e commit ef18d3f
Show file tree
Hide file tree
Showing 9 changed files with 445 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

/**
 * Enum representing the mode in which token filters and analyzers are allowed to operate.
 * While most token filters are allowed both in index and search time analyzers, some are
 * restricted to be used only at index time, others at search time.
 */
public enum AnalysisMode {

    /**
     * AnalysisMode representing analysis components that can be used only at index time
     */
    INDEX_TIME("index time") {
        @Override
        public AnalysisMode merge(AnalysisMode other) {
            // merging with the opposite restriction is a conflict: no mode satisfies both
            if (other == AnalysisMode.SEARCH_TIME) {
                throw new IllegalStateException("Cannot merge SEARCH_TIME and INDEX_TIME analysis mode.");
            }
            return AnalysisMode.INDEX_TIME;
        }
    },
    /**
     * AnalysisMode representing analysis components that can be used only at search time
     */
    SEARCH_TIME("search time") {
        @Override
        public AnalysisMode merge(AnalysisMode other) {
            // merging with the opposite restriction is a conflict: no mode satisfies both
            if (other == AnalysisMode.INDEX_TIME) {
                throw new IllegalStateException("Cannot merge SEARCH_TIME and INDEX_TIME analysis mode.");
            }
            return AnalysisMode.SEARCH_TIME;
        }
    },
    /**
     * AnalysisMode representing analysis components that can be used both at index and search time
     */
    ALL("all") {
        @Override
        public AnalysisMode merge(AnalysisMode other) {
            // ALL imposes no restriction, so the result is whatever the other side requires
            return other;
        }
    };

    // human-readable name used in error messages; enum constants are shared singletons,
    // so this must be immutable
    private final String readableName;

    AnalysisMode(String name) {
        this.readableName = name;
    }

    /** Returns the human-readable name of this mode, e.g. for use in error messages. */
    public String getReadableName() {
        return this.readableName;
    }

    /**
     * Returns a mode that is compatible with both this mode and the other mode, that is:
     * <ul>
     * <li>ALL.merge(INDEX_TIME) == INDEX_TIME</li>
     * <li>ALL.merge(SEARCH_TIME) == SEARCH_TIME</li>
     * <li>INDEX_TIME.merge(SEARCH_TIME) throws an {@link IllegalStateException}</li>
     * </ul>
     */
    abstract AnalysisMode merge(AnalysisMode other);
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
Expand Down Expand Up @@ -456,6 +456,7 @@ public IndexAnalyzers build(IndexSettings indexSettings,
if (defaultAnalyzer == null) {
throw new IllegalArgumentException("no default analyzer configured");
}
defaultAnalyzer.checkAllowedInMode(AnalysisMode.ALL);
if (analyzers.containsKey("default_index")) {
throw new IllegalArgumentException("setting [index.analysis.analyzer.default_index] is not supported anymore, use " +
"[index.analysis.analyzer.default] instead for index [" + index.getName() + "]");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public final class CustomAnalyzer extends Analyzer {

private final int positionIncrementGap;
private final int offsetGap;
private final AnalysisMode analysisMode;

public CustomAnalyzer(String tokenizerName, TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
TokenFilterFactory[] tokenFilters) {
Expand All @@ -50,6 +51,12 @@ public CustomAnalyzer(String tokenizerName, TokenizerFactory tokenizerFactory, C
this.tokenFilters = tokenFilters;
this.positionIncrementGap = positionIncrementGap;
this.offsetGap = offsetGap;
// merge and transfer token filter analysis modes with analyzer
AnalysisMode mode = AnalysisMode.ALL;
for (TokenFilterFactory f : tokenFilters) {
mode = mode.merge(f.getAnalysisMode());
}
this.analysisMode = mode;
}

/**
Expand Down Expand Up @@ -84,6 +91,10 @@ public int getOffsetGap(String field) {
return this.offsetGap;
}

/** Returns the {@link AnalysisMode} merged from the modes of all token filters in this analyzer's chain. */
public AnalysisMode getAnalysisMode() {
return this.analysisMode;
}

@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = tokenizerFactory.create();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.elasticsearch.index.mapper.MapperException;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
Expand All @@ -34,6 +37,7 @@ public class NamedAnalyzer extends DelegatingAnalyzerWrapper {
private final AnalyzerScope scope;
private final Analyzer analyzer;
private final int positionIncrementGap;
private final AnalysisMode analysisMode;

public NamedAnalyzer(NamedAnalyzer analyzer, int positionIncrementGap) {
this(analyzer.name(), analyzer.scope(), analyzer.analyzer(), positionIncrementGap);
Expand All @@ -43,12 +47,17 @@ public NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer) {
this(name, scope, analyzer, Integer.MIN_VALUE);
}

public NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer, int positionIncrementGap) {
/**
 * Creates a named wrapper around the given analyzer.
 * A {@link CustomAnalyzer} carries the {@link AnalysisMode} merged from its token filters,
 * which this wrapper adopts; any other analyzer is treated as unrestricted ({@link AnalysisMode#ALL}).
 */
NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer, int positionIncrementGap) {
    super(ERROR_STRATEGY);
    this.name = name;
    this.scope = scope;
    this.analyzer = analyzer;
    this.positionIncrementGap = positionIncrementGap;
    // no need for the fully-qualified name: CustomAnalyzer is in the same package (see checkAllowedInMode)
    if (analyzer instanceof CustomAnalyzer) {
        this.analysisMode = ((CustomAnalyzer) analyzer).getAnalysisMode();
    } else {
        this.analysisMode = AnalysisMode.ALL;
    }
}

/**
Expand All @@ -65,6 +74,13 @@ public AnalyzerScope scope() {
return this.scope;
}

/**
 * Returns the {@link AnalysisMode} this analyzer is allowed to be used in (index time,
 * search time, or both).
 */
public AnalysisMode getAnalysisMode() {
return this.analysisMode;
}

/**
* The actual analyzer.
*/
Expand All @@ -85,9 +101,37 @@ public int getPositionIncrementGap(String fieldName) {
return super.getPositionIncrementGap(fieldName);
}

/**
 * Checks the wrapped analyzer for the provided restricted {@link AnalysisMode} and throws
 * an error if the analyzer is not allowed to run in that mode. The error contains more detailed information about
 * the offending filters that caused the analyzer to not be allowed in this mode.
 */
public void checkAllowedInMode(AnalysisMode mode) {
    Objects.requireNonNull(mode);
    AnalysisMode currentMode = this.getAnalysisMode();
    if (currentMode == AnalysisMode.ALL || currentMode == mode) {
        // unrestricted analyzers and exact mode matches are always permitted
        return;
    }
    if (analyzer instanceof CustomAnalyzer) {
        // collect the individual token filters whose mode restriction conflicts, for a detailed message
        List<String> offendingFilters = new ArrayList<>();
        for (TokenFilterFactory tokenFilter : ((CustomAnalyzer) analyzer).tokenFilters()) {
            if (tokenFilter.getAnalysisMode() != mode) {
                offendingFilters.add(tokenFilter.name());
            }
        }
        throw new MapperException("analyzer [" + name + "] contains filters " + offendingFilters
                + " that are not allowed to run in " + mode.getReadableName() + " mode.");
    }
    throw new MapperException(
            "analyzer [" + name + "] contains components that are not allowed to run in " + mode.getReadableName() + " mode.");
}

@Override
public String toString() {
return "analyzer name[" + name + "], analyzer [" + analyzer + "]";
return "analyzer name[" + name + "], analyzer [" + analyzer + "], analysisMode [" + analysisMode + "]";
}

/** It is an error if this is ever used, it means we screwed up! */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ default TokenFilterFactory getSynonymFilter() {
return this;
}

/**
 * Get the {@link AnalysisMode} this filter is allowed to be used in. The default is
 * {@link AnalysisMode#ALL}. Instances need to override this method to define their
 * own restrictions.
 * @return the analysis mode this token filter may run in; defaults to {@link AnalysisMode#ALL}
 */
default AnalysisMode getAnalysisMode() {
return AnalysisMode.ALL;
}

/**
* A TokenFilterFactory that does no filtering to its TokenStream
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.time.DateFormatter;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.analysis.AnalysisMode;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.similarity.SimilarityProvider;

Expand Down Expand Up @@ -80,18 +81,37 @@ private static void parseAnalyzersAndTermVectors(FieldMapper.Builder builder, St
if (analyzer == null) {
throw new MapperParsingException("analyzer [" + propNode.toString() + "] not found for field [" + name + "]");
}
analyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
searchAnalyzer = analyzer;
iterator.remove();
} else if (propName.equals("search_quote_analyzer")) {
NamedAnalyzer analyzer = parserContext.getIndexAnalyzers().get(propNode.toString());
if (analyzer == null) {
throw new MapperParsingException("analyzer [" + propNode.toString() + "] not found for field [" + name + "]");
}
analyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
searchQuoteAnalyzer = analyzer;
iterator.remove();
}
}

// check analyzers are allowed to work in the respective AnalysisMode
{
if (indexAnalyzer != null) {
if (searchAnalyzer == null) {
indexAnalyzer.checkAllowedInMode(AnalysisMode.ALL);
} else {
indexAnalyzer.checkAllowedInMode(AnalysisMode.INDEX_TIME);
}
}
if (searchAnalyzer != null) {
searchAnalyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
}
if (searchQuoteAnalyzer != null) {
searchQuoteAnalyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
}
}

if (indexAnalyzer == null && searchAnalyzer != null) {
throw new MapperParsingException("analyzer on field [" + name + "] must be set when search_analyzer is set");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
package org.elasticsearch.index.analysis;

import com.carrotsearch.randomizedtesting.generators.RandomPicks;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
Expand All @@ -31,6 +33,7 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.MapperException;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
Expand Down Expand Up @@ -102,6 +105,29 @@ public void testOverrideDefaultAnalyzer() throws IOException {
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
}

// The "default" analyzer must work at both index and search time (AnalysisMode.ALL), so a
// custom analyzer containing a token filter restricted to one context must be rejected.
public void testOverrideDefaultAnalyzerWithoutAnalysisModeAll() throws IOException {
Version version = VersionUtils.randomVersion(random());
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
// a token filter that restricts itself to a single (randomly chosen) mode; create() is unused here
TokenFilterFactory tokenFilter = new AbstractTokenFilterFactory(IndexSettingsModule.newIndexSettings("index", settings),
"my_filter", Settings.EMPTY) {
@Override
public AnalysisMode getAnalysisMode() {
return randomFrom(AnalysisMode.SEARCH_TIME, AnalysisMode.INDEX_TIME);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return null;
}
};
Analyzer analyzer = new CustomAnalyzer("tokenizerName", null, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
// registering this analyzer as "default" must fail with a message naming the offending filter
MapperException ex = expectThrows(MapperException.class,
() -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings),
singletonMap("default", new PreBuiltAnalyzerProvider("my_analyzer", AnalyzerScope.INDEX, analyzer)), emptyMap(),
emptyMap(), emptyMap(), emptyMap()));
assertEquals("analyzer [my_analyzer] contains filters [my_filter] that are not allowed to run in all mode.", ex.getMessage());
}

public void testOverrideDefaultIndexAnalyzerIsUnsupported() {
Version version = VersionUtils.randomVersionBetween(random(), Version.V_6_0_0_alpha1, Version.CURRENT);
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
Expand Down
Loading

0 comments on commit ef18d3f

Please sign in to comment.