Skip to content

Commit

Permalink
Improve similarity integration. (#29187)
Browse files Browse the repository at this point in the history
This improves the way similarities are plugged in in order to:
 - reject the classic similarity on 7.x indices and emit a deprecation
   warning otherwise
 - reject unkwown parameters on 7.x indices and emit a deprecation
   warning otherwise

Even though this breaks the plugin API, I'd like to backport to 7.x so
that users can get deprecation warnings when they are doing something
that will become unsupported in the future.

Closes #23208
Closes #29035
  • Loading branch information
jpountz authored Apr 3, 2018
1 parent 8cdd950 commit 569d0c0
Show file tree
Hide file tree
Showing 26 changed files with 513 additions and 813 deletions.
18 changes: 2 additions & 16 deletions docs/reference/index-modules/similarity.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -82,20 +82,6 @@ This similarity has the following options:

Type name: `BM25`

[float]
[[classic-similarity]]
==== Classic similarity

The classic similarity that is based on the TF/IDF model. This
similarity has the following option:

`discount_overlaps`::
Determines whether overlap tokens (Tokens with
0 position increment) are ignored when computing norm. By default this
is true, meaning overlap tokens do not count when computing norms.

Type name: `classic`

[float]
[[dfr]]
==== DFR similarity
Expand Down Expand Up @@ -541,7 +527,7 @@ PUT /index
"index": {
"similarity": {
"default": {
"type": "classic"
"type": "boolean"
}
}
}
Expand All @@ -563,7 +549,7 @@ PUT /index/_settings
"index": {
"similarity": {
"default": {
"type": "classic"
"type": "boolean"
}
}
}
Expand Down
9 changes: 2 additions & 7 deletions docs/reference/mapping/params/similarity.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,9 @@ PUT my_index
"default_field": { <1>
"type": "text"
},
"classic_field": {
"type": "text",
"similarity": "classic" <2>
},
"boolean_sim_field": {
"type": "text",
"similarity": "boolean" <3>
"similarity": "boolean" <2>
}
}
}
Expand All @@ -59,5 +55,4 @@ PUT my_index
--------------------------------------------------
// CONSOLE
<1> The `default_field` uses the `BM25` similarity.
<2> The `classic_field` uses the `classic` similarity (ie TF/IDF).
<3> The `boolean_sim_field` uses the `boolean` similarity.
<2> The `boolean_sim_field` uses the `boolean` similarity.
13 changes: 13 additions & 0 deletions docs/reference/migration/migrate_7_0/mappings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,16 @@ the index setting `index.mapping.nested_objects.limit`.
==== The `update_all_types` option has been removed

This option is useless now that all indices have at most one type.

=== The `classic` similarity has been removed

The `classic` similarity relied on coordination factors for scoring to be good
in presence of stopwords in the query. This feature has been removed from
Lucene, which means that the `classic` similarity now produces scores of lower
quality. It is advised to switch to `BM25` instead, which is widely accepted
as a better alternative.

=== Similarities fail when unsupported options are provided

An error will now be thrown when unknown configuration options are provided
to similarities. Such unknown parameters were ignored before.
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,7 @@ public void testNonDefaultSimilarity() throws Exception {
hasChildQuery(CHILD_DOC, new TermQueryBuilder("custom_string", "value"), ScoreMode.None);
HasChildQueryBuilder.LateParsingQuery query = (HasChildQueryBuilder.LateParsingQuery) hasChildQueryBuilder.toQuery(shardContext);
Similarity expected = SimilarityService.BUILT_IN.get(similarity)
.create(similarity, Settings.EMPTY,
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(), null)
.get();
.apply(Settings.EMPTY, Version.CURRENT, null);
assertThat(((PerFieldSimilarityWrapper) query.getSimilarity()).get("custom_string"), instanceOf(expected.getClass()));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ protected Collection<Class<? extends Plugin>> getPlugins() {

@Override
protected void initializeAdditionalMappings(MapperService mapperService) throws IOException {
similarity = randomFrom("classic", "BM25");
similarity = randomFrom("boolean", "BM25");
// TODO: use a single type when inner hits have been changed to work with join field,
// this test randomly generates queries with inner hits
mapperService.merge(PARENT_TYPE, new CompressedXContent(Strings.toString(PutMappingRequest.buildFromSimplifiedDef(PARENT_TYPE,
Expand Down Expand Up @@ -323,9 +323,7 @@ public void testNonDefaultSimilarity() throws Exception {
hasChildQuery(CHILD_TYPE, new TermQueryBuilder("custom_string", "value"), ScoreMode.None);
HasChildQueryBuilder.LateParsingQuery query = (HasChildQueryBuilder.LateParsingQuery) hasChildQueryBuilder.toQuery(shardContext);
Similarity expected = SimilarityService.BUILT_IN.get(similarity)
.create(similarity, Settings.EMPTY,
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(), null)
.get();
.apply(Settings.EMPTY, Version.CURRENT, null);
assertThat(((PerFieldSimilarityWrapper) query.getSimilarity()).get("custom_string"), instanceOf(expected.getClass()));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@

import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.similarities.Similarity;
import org.elasticsearch.Version;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.settings.IndexScopedSettings;
import org.elasticsearch.common.settings.Settings;
Expand All @@ -31,8 +33,8 @@
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.similarity.SimilarityService;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.indices.mapper.MapperRegistry;
import org.elasticsearch.script.ScriptService;

import java.util.AbstractMap;
import java.util.Collection;
Expand Down Expand Up @@ -142,22 +144,23 @@ private void checkMappingsCompatibility(IndexMetaData indexMetaData) {

IndexSettings indexSettings = new IndexSettings(indexMetaData, this.settings);

final Map<String, SimilarityProvider.Factory> similarityMap = new AbstractMap<String, SimilarityProvider.Factory>() {
final Map<String, TriFunction<Settings, Version, ScriptService, Similarity>> similarityMap
= new AbstractMap<String, TriFunction<Settings, Version, ScriptService, Similarity>>() {
@Override
public boolean containsKey(Object key) {
return true;
}

@Override
public SimilarityProvider.Factory get(Object key) {
public TriFunction<Settings, Version, ScriptService, Similarity> get(Object key) {
assert key instanceof String : "key must be a string but was: " + key.getClass();
return SimilarityService.BUILT_IN.get(SimilarityService.DEFAULT_SIMILARITY);
}

// this entrySet impl isn't fully correct but necessary as SimilarityService will iterate
// over all similarities
@Override
public Set<Entry<String, SimilarityProvider.Factory>> entrySet() {
public Set<Entry<String, TriFunction<Settings, Version, ScriptService, Similarity>>> entrySet() {
return Collections.emptySet();
}
};
Expand Down
24 changes: 15 additions & 9 deletions server/src/main/java/org/elasticsearch/index/IndexModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@

package org.elasticsearch.index;

import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.Version;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
Expand All @@ -39,9 +43,6 @@
import org.elasticsearch.index.shard.IndexSearcherWrapper;
import org.elasticsearch.index.shard.IndexingOperationListener;
import org.elasticsearch.index.shard.SearchOperationListener;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.similarity.BM25SimilarityProvider;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.index.similarity.SimilarityService;
import org.elasticsearch.index.store.IndexStore;
import org.elasticsearch.indices.IndicesQueryCache;
Expand All @@ -68,10 +69,10 @@
/**
* IndexModule represents the central extension point for index level custom implementations like:
* <ul>
* <li>{@link SimilarityProvider} - New {@link SimilarityProvider} implementations can be registered through
* {@link #addSimilarity(String, SimilarityProvider.Factory)} while existing Providers can be referenced through Settings under the
* <li>{@link Similarity} - New {@link Similarity} implementations can be registered through
* {@link #addSimilarity(String, TriFunction)} while existing Providers can be referenced through Settings under the
* {@link IndexModule#SIMILARITY_SETTINGS_PREFIX} prefix along with the "type" value. For example, to reference the
* {@link BM25SimilarityProvider}, the configuration <tt>"index.similarity.my_similarity.type : "BM25"</tt> can be used.</li>
* {@link BM25Similarity}, the configuration <tt>"index.similarity.my_similarity.type : "BM25"</tt> can be used.</li>
* <li>{@link IndexStore} - Custom {@link IndexStore} instances can be registered via {@link #addIndexStore(String, Function)}</li>
* <li>{@link IndexEventListener} - Custom {@link IndexEventListener} instances can be registered via
* {@link #addIndexEventListener(IndexEventListener)}</li>
Expand Down Expand Up @@ -107,7 +108,7 @@ public final class IndexModule {
final SetOnce<EngineFactory> engineFactory = new SetOnce<>();
private SetOnce<IndexSearcherWrapperFactory> indexSearcherWrapper = new SetOnce<>();
private final Set<IndexEventListener> indexEventListeners = new HashSet<>();
private final Map<String, SimilarityProvider.Factory> similarities = new HashMap<>();
private final Map<String, TriFunction<Settings, Version, ScriptService, Similarity>> similarities = new HashMap<>();
private final Map<String, Function<IndexSettings, IndexStore>> storeTypes = new HashMap<>();
private final SetOnce<BiFunction<IndexSettings, IndicesQueryCache, QueryCache>> forceQueryCacheProvider = new SetOnce<>();
private final List<SearchOperationListener> searchOperationListeners = new ArrayList<>();
Expand Down Expand Up @@ -246,12 +247,17 @@ public void addIndexStore(String type, Function<IndexSettings, IndexStore> provi


/**
* Registers the given {@link SimilarityProvider} with the given name
* Registers the given {@link Similarity} with the given name.
* The function takes as parameters:<ul>
* <li>settings for this similarity
* <li>version of Elasticsearch when the index was created
* <li>ScriptService, for script-based similarities
* </ul>
*
* @param name Name of the SimilarityProvider
* @param similarity SimilarityProvider to register
*/
public void addSimilarity(String name, SimilarityProvider.Factory similarity) {
public void addSimilarity(String name, TriFunction<Settings, Version, ScriptService, Similarity> similarity) {
ensureNotFrozen();
if (similarities.containsKey(name) || SimilarityService.BUILT_IN.containsKey(name)) {
throw new IllegalArgumentException("similarity for name: [" + name + " is already registered");
Expand Down

This file was deleted.

This file was deleted.

Loading

0 comments on commit 569d0c0

Please sign in to comment.