From eb10c37b928d5dc960b7c202d2b530f82a053fb9 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 12 Oct 2016 09:52:03 +0200 Subject: [PATCH 1/3] Expose splitOnWhitespace in `Query String Query` This change adds an option called `split_on_whitespace` which prevents the query parser to split free text part on whitespace prior to analysis. Instead the queryparser would parse around only real 'operators'. Default to true. For instance the query `"foo bar"` would let the analyzer of the targeted field decide how the tokens should be splitted. Some options are missing in this change but I'd like to add them in a follow up PR in order to be able to simplify the backport in 5.x. The missing options (changes) are: * A `type` option which similarly to the `multi_match` query defines how the free text should be parsed when multi fields are defined. * Simple range query with additional tokens like ">100 50" are broken when `split_on_whitespace` is set to false. It should be possible to preserve this syntax and make the parser aware of this special syntax even when `split_on_whitespace` is set to false. * Since all this options would make the `query_string_query` very similar to a match (multi_match) query we should be able to share the code that produce the final Lucene query. --- .../classic/MapperQueryParser.java | 1 + .../classic/QueryParserSettings.java | 10 ++ .../index/query/QueryStringQueryBuilder.java | 30 ++++- .../query/QueryStringQueryBuilderTests.java | 125 ++++++++++++++++++ .../query-dsl/query-string-query.asciidoc | 5 + .../query-dsl/query-string-syntax.asciidoc | 4 +- 6 files changed, 171 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java index b6f0020fb5bc7..bbb27aa11db4b 100644 --- a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java +++ b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java @@ -104,6 +104,7 @@ public void reset(QueryParserSettings settings) { setDefaultOperator(settings.defaultOperator()); setFuzzyPrefixLength(settings.fuzzyPrefixLength()); setLocale(settings.locale()); + setSplitOnWhitespace(settings.splitOnWhitespace()); } /** diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java b/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java index c1fc2ae556ea7..a03c84d89803a 100644 --- a/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java +++ b/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java @@ -79,6 +79,8 @@ public class QueryParserSettings { /** To limit effort spent determinizing regexp queries. */ private int maxDeterminizedStates; + private boolean splitOnWhitespace; + public QueryParserSettings(String queryString) { this.queryString = queryString; } @@ -290,4 +292,12 @@ public void fuzziness(Fuzziness fuzziness) { public Fuzziness fuzziness() { return fuzziness; } + + public void splitOnWhitespace(boolean value) { + this.splitOnWhitespace = value; + } + + public boolean splitOnWhitespace() { + return splitOnWhitespace; + } } diff --git a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java index 807343237d240..52625747ca38c 100644 --- a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java +++ b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java @@ -72,6 +72,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder{@value #DEFAULT_SPLIT_ON_WHITESPACE}. + */ + public QueryStringQueryBuilder splitOnWhitespace(boolean value) { + this.splitOnWhitespace = value; + return this; + } + + public boolean splitOnWhitespace() { + return splitOnWhitespace; + } + @Override protected void doXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(NAME); @@ -626,6 +645,7 @@ protected void doXContent(XContentBuilder builder, Params params) throws IOExcep builder.field(TIME_ZONE_FIELD.getPreferredName(), this.timeZone.getID()); } builder.field(ESCAPE_FIELD.getPreferredName(), this.escape); + builder.field(SPLIT_ON_WHITESPACE.getPreferredName(), this.splitOnWhitespace); printBoostAndQueryName(builder); builder.endObject(); } @@ -661,6 +681,7 @@ public static Optional fromXContent(QueryParseContext p Fuzziness fuzziness = QueryStringQueryBuilder.DEFAULT_FUZZINESS; String fuzzyRewrite = null; String rewrite = null; + boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE; Map fieldsAndWeights = new HashMap<>(); while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { @@ -750,6 +771,8 @@ public static Optional fromXContent(QueryParseContext p } } else if (parseContext.getParseFieldMatcher().match(currentFieldName, AbstractQueryBuilder.NAME_FIELD)) { queryName = parser.text(); + } else if (parseContext.getParseFieldMatcher().match(currentFieldName, SPLIT_ON_WHITESPACE)) { + splitOnWhitespace = parser.booleanValue(); } else { throw new ParsingException(parser.getTokenLocation(), "[" + QueryStringQueryBuilder.NAME + "] query does not support [" + currentFieldName + "]"); @@ -791,6 +814,7 @@ public static Optional fromXContent(QueryParseContext p queryStringQuery.locale(locale); queryStringQuery.boost(boost); queryStringQuery.queryName(queryName); + queryStringQuery.splitOnWhitespace(splitOnWhitespace); return Optional.of(queryStringQuery); } @@ -827,7 +851,8 @@ protected boolean doEquals(QueryStringQueryBuilder other) { timeZone == null ? other.timeZone == null : other.timeZone != null && Objects.equals(timeZone.getID(), other.timeZone.getID()) && Objects.equals(escape, other.escape) && - Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates); + Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates) && + Objects.equals(splitOnWhitespace, other.splitOnWhitespace); } @Override @@ -836,7 +861,7 @@ protected int doHashCode() { quoteFieldSuffix, autoGeneratePhraseQueries, allowLeadingWildcard, lowercaseExpandedTerms, enablePositionIncrements, analyzeWildcard, locale.toLanguageTag(), fuzziness, fuzzyPrefixLength, fuzzyMaxExpansions, fuzzyRewrite, phraseSlop, useDisMax, tieBreaker, rewrite, minimumShouldMatch, lenient, - timeZone == null ? 0 : timeZone.getID(), escape, maxDeterminizedStates); + timeZone == null ? 0 : timeZone.getID(), escape, maxDeterminizedStates, splitOnWhitespace); } @Override @@ -904,6 +929,7 @@ protected Query doToQuery(QueryShardContext context) throws IOException { qpSettings.lenient(lenient == null ? context.queryStringLenient() : lenient); qpSettings.timeZone(timeZone); qpSettings.maxDeterminizedStates(maxDeterminizedStates); + qpSettings.splitOnWhitespace(splitOnWhitespace); MapperQueryParser queryParser = context.queryParser(qpSettings); Query query; diff --git a/core/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java b/core/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java index 353d5704d4d1d..1241fde362471 100644 --- a/core/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java +++ b/core/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java @@ -45,6 +45,7 @@ import org.joda.time.DateTimeZone; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -151,6 +152,7 @@ protected QueryStringQueryBuilder doCreateTestQueryBuilder() { if (randomBoolean()) { queryStringQueryBuilder.timeZone(randomDateTimeZone().getID()); } + queryStringQueryBuilder.splitOnWhitespace(randomBoolean()); return queryStringQueryBuilder; } @@ -532,6 +534,128 @@ public void testToQueryPhraseQueryBoostAndSlop() throws IOException { assertThat(phraseQuery.getTerms().length, equalTo(2)); } + public void testToQuerySplitOnWhitespace() throws IOException { + assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0); + // splitOnWhitespace=false + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("foo bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(false); + Query query = queryBuilder.toQuery(createShardContext()); + BooleanQuery bq1 = + new BooleanQuery.Builder() + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD)) + .build(); + List disjuncts = new ArrayList<>(); + disjuncts.add(bq1); + disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar"))); + DisjunctionMaxQuery expectedQuery = new DisjunctionMaxQuery(disjuncts, 0.0f); + assertThat(query, equalTo(expectedQuery)); + } + + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("mapped_string:other foo bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(false); + Query query = queryBuilder.toQuery(createShardContext()); + BooleanQuery bq1 = + new BooleanQuery.Builder() + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD)) + .build(); + List disjuncts = new ArrayList<>(); + disjuncts.add(bq1); + disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar"))); + DisjunctionMaxQuery disjunctionMaxQuery = new DisjunctionMaxQuery(disjuncts, 0.0f); + BooleanQuery expectedQuery = + new BooleanQuery.Builder() + .add(disjunctionMaxQuery, BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term(STRING_FIELD_NAME, "other")), BooleanClause.Occur.SHOULD) + .build(); + assertThat(query, equalTo(expectedQuery)); + } + + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("foo OR bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(false); + Query query = queryBuilder.toQuery(createShardContext()); + + List disjuncts1 = new ArrayList<>(); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo"))); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo"))); + DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f); + + List disjuncts2 = new ArrayList<>(); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar"))); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar"))); + DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f); + + BooleanQuery expectedQuery = + new BooleanQuery.Builder() + .add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD)) + .build(); + assertThat(query, equalTo(expectedQuery)); + } + + // split_on_whitespace=false breaks range query with simple syntax + { + // throws an exception when lenient is set to false + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder(">10 foo") + .field(INT_FIELD_NAME) + .splitOnWhitespace(false); + IllegalArgumentException exc = + expectThrows(IllegalArgumentException.class, () -> queryBuilder.toQuery(createShardContext())); + assertThat(exc.getMessage(), equalTo("For input string: \"10 foo\"")); + } + + { + // returns an empty boolean query when lenient is set to true + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder(">10 foo") + .field(INT_FIELD_NAME) + .splitOnWhitespace(false) + .lenient(true); + Query query = queryBuilder.toQuery(createShardContext()); + BooleanQuery bq = new BooleanQuery.Builder().build(); + assertThat(bq, equalTo(query)); + } + + // splitOnWhitespace=true + { + QueryStringQueryBuilder queryBuilder = + new QueryStringQueryBuilder("foo bar") + .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2) + .splitOnWhitespace(true); + Query query = queryBuilder.toQuery(createShardContext()); + + List disjuncts1 = new ArrayList<>(); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo"))); + disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo"))); + DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f); + + List disjuncts2 = new ArrayList<>(); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar"))); + disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar"))); + DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f); + + BooleanQuery expectedQuery = + new BooleanQuery.Builder() + .add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD)) + .build(); + assertThat(query, equalTo(expectedQuery)); + } + + + } + public void testFromJson() throws IOException { String json = "{\n" + @@ -552,6 +676,7 @@ public void testFromJson() throws IOException { " \"phrase_slop\" : 0,\n" + " \"locale\" : \"und\",\n" + " \"escape\" : false,\n" + + " \"split_on_whitespace\" : true,\n" + " \"boost\" : 1.0\n" + " }\n" + "}"; diff --git a/docs/reference/query-dsl/query-string-query.asciidoc b/docs/reference/query-dsl/query-string-query.asciidoc index fe113b0a28965..b8ce68721a1e8 100644 --- a/docs/reference/query-dsl/query-string-query.asciidoc +++ b/docs/reference/query-dsl/query-string-query.asciidoc @@ -90,6 +90,11 @@ http://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html[JODA timez the query string. This allows to use a field that has a different analysis chain for exact matching. Look <> for a comprehensive example. + +|`split_on_whitespace` |Whether query text should be split on whitespace prior to analysis. + Instead the queryparser would parse around only real 'operators'. + Default to `false`. + |======================================================================= When a multi term query is being generated, one can control how it gets diff --git a/docs/reference/query-dsl/query-string-syntax.asciidoc b/docs/reference/query-dsl/query-string-syntax.asciidoc index 9e847102469cf..49a8b54c5062b 100644 --- a/docs/reference/query-dsl/query-string-syntax.asciidoc +++ b/docs/reference/query-dsl/query-string-syntax.asciidoc @@ -282,8 +282,8 @@ A space may also be a reserved character. For instance, if you have a synonym list which converts `"wi fi"` to `"wifi"`, a `query_string` search for `"wi fi"` would fail. The query string parser would interpret your query as a search for `"wi OR fi"`, while the token stored in your -index is actually `"wifi"`. Escaping the space will protect it from -being touched by the query string parser: `"wi\ fi"`. +index is actually `"wifi"`. The option `split_on_whitespace=false` will protect it from +being touched by the query string parser and will let the analysis run on the entire input (`"wi fi"`). **** ===== Empty Query From 11d0077d9d722c5363915f700093a4becf75f765 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 28 Oct 2016 09:56:46 +0200 Subject: [PATCH 2/3] add serialization protection based on the stream version --- .../index/query/QueryStringQueryBuilder.java | 11 +++++++++-- .../src/test/java/org/elasticsearch/VersionTests.java | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java index 52625747ca38c..a93ab86d8f743 100644 --- a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java +++ b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java @@ -26,6 +26,7 @@ import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.io.stream.StreamInput; @@ -59,6 +60,8 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder { public static final String NAME = "query_string"; + public static final Version V_5_1_0_UNRELEASED = Version.fromId(5010099); + public static final boolean DEFAULT_AUTO_GENERATE_PHRASE_QUERIES = false; public static final int DEFAULT_MAX_DETERMINED_STATES = Operations.DEFAULT_MAX_DETERMINIZED_STATES; public static final boolean DEFAULT_LOWERCASE_EXPANDED_TERMS = true; @@ -204,7 +207,9 @@ public QueryStringQueryBuilder(StreamInput in) throws IOException { timeZone = in.readOptionalTimeZone(); escape = in.readBoolean(); maxDeterminizedStates = in.readVInt(); - splitOnWhitespace = in.readBoolean(); + if (in.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) { + splitOnWhitespace = in.readBoolean(); + } } @Override @@ -239,7 +244,9 @@ protected void doWriteTo(StreamOutput out) throws IOException { out.writeOptionalTimeZone(timeZone); out.writeBoolean(this.escape); out.writeVInt(this.maxDeterminizedStates); - out.writeBoolean(this.splitOnWhitespace); + if (out.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) { + out.writeBoolean(this.splitOnWhitespace); + } } public String queryString() { diff --git a/core/src/test/java/org/elasticsearch/VersionTests.java b/core/src/test/java/org/elasticsearch/VersionTests.java index 42ea115bf4308..167a36a96d18e 100644 --- a/core/src/test/java/org/elasticsearch/VersionTests.java +++ b/core/src/test/java/org/elasticsearch/VersionTests.java @@ -23,6 +23,7 @@ import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.query.QueryStringQueryBuilder; import org.elasticsearch.monitor.os.OsStats; import org.elasticsearch.index.query.SimpleQueryStringBuilder; import org.elasticsearch.search.internal.AliasFilter; @@ -275,6 +276,7 @@ public void testUnknownVersions() { assertUnknownVersion(AliasFilter.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant assertUnknownVersion(OsStats.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant assertUnknownVersion(SimpleQueryStringBuilder.V_5_1_0_UNRELEASED); + assertUnknownVersion(QueryStringQueryBuilder.V_5_1_0_UNRELEASED); // once we released 5.0.0 and it's added to Version.java we need to remove this constant } From 3ee2faef92d295b6600defce2313c0bf9c0824f3 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 2 Nov 2016 09:58:50 +0100 Subject: [PATCH 3/3] explicitly set default value for split_on_whitespace for versions prior to 5.1 --- .../org/elasticsearch/index/query/QueryStringQueryBuilder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java index a93ab86d8f743..d88484c98a62e 100644 --- a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java +++ b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryBuilder.java @@ -209,6 +209,8 @@ public QueryStringQueryBuilder(StreamInput in) throws IOException { maxDeterminizedStates = in.readVInt(); if (in.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) { splitOnWhitespace = in.readBoolean(); + } else { + splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE; } }