From a26b9b76a7ebc017e1a9421b1e2200d89dc04a9d Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 18 Jun 2018 09:46:12 +0100 Subject: [PATCH] Expose lucene's RemoveDuplicatesTokenFilter (#31275) --- docs/reference/analysis/tokenfilters.asciidoc | 4 +- .../remove-duplicates-tokenfilter.asciidoc | 5 ++ .../analysis/common/CommonAnalysisPlugin.java | 1 + .../RemoveDuplicatesTokenFilterFactory.java | 42 +++++++++++++ .../RemoveDuplicatesFilterFactoryTests.java | 61 +++++++++++++++++++ 5 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RemoveDuplicatesTokenFilterFactory.java create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RemoveDuplicatesFilterFactoryTests.java diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index 6e77b4498650d..dd5cb2e702cff 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -95,4 +95,6 @@ include::tokenfilters/decimal-digit-tokenfilter.asciidoc[] include::tokenfilters/fingerprint-tokenfilter.asciidoc[] -include::tokenfilters/minhash-tokenfilter.asciidoc[] \ No newline at end of file +include::tokenfilters/minhash-tokenfilter.asciidoc[] + +include::tokenfilters/remove-duplicates-tokenfilter.asciidoc[] \ No newline at end of file diff --git a/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc new file mode 100644 index 0000000000000..594e18eaf7f7e --- /dev/null +++ b/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc @@ -0,0 +1,5 @@ +[[analysis-remove-duplicates-tokenfilter]] +=== Remove Duplicates Token Filter + +A token filter of type `remove_duplicates` that drops identical 
tokens at the +same position. diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 722d75a9293f7..04df77245438c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -194,6 +194,7 @@ public Map> getTokenFilters() { filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); + filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new); filters.put("reverse", ReverseTokenFilterFactory::new); filters.put("russian_stem", RussianStemTokenFilterFactory::new); filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RemoveDuplicatesTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RemoveDuplicatesTokenFilterFactory.java new file mode 100644 index 0000000000000..a136c5573121e --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RemoveDuplicatesTokenFilterFactory.java @@ -0,0 +1,42 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; + +/** + * Token filter factory for Lucene's {@link RemoveDuplicatesTokenFilter}: {@link #create(TokenStream)} wraps the given stream in that filter. + */ +class RemoveDuplicatesTokenFilterFactory extends AbstractTokenFilterFactory { + + RemoveDuplicatesTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); /* env is not used by this constructor */ + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new RemoveDuplicatesTokenFilter(tokenStream); + } +} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RemoveDuplicatesFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RemoveDuplicatesFilterFactoryTests.java new file mode 100644 index 0000000000000..8180985416f52 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RemoveDuplicatesFilterFactoryTests.java @@ -0,0 +1,61 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.Token; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; + +import static org.hamcrest.Matchers.instanceOf; + +public class RemoveDuplicatesFilterFactoryTests extends ESTokenStreamTestCase { + + public void testRemoveDuplicatesFilter() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.removedups.type", "remove_duplicates") + .build(); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("removedups"); + assertThat(tokenFilter, instanceOf(RemoveDuplicatesTokenFilterFactory.class)); /* the filter configured with type "remove_duplicates" must be this module's factory */ + + CannedTokenStream cts = new CannedTokenStream( + new Token("a", 1, 0, 1), + new Token("b", 1, 2, 3), + new Token("c", 0, 2, 3), + new Token("b", 0, 2, 3), /* repeats "b"; the expected terms asserted below contain "b" only once */ + new Token("d", 1, 4, 5) + ); + 
assertTokenStreamContents(tokenFilter.create(cts), new String[]{ + "a", "b", "c", "d" + }, new int[]{ + 1, 1, 0, 1 + }); + } + +}