Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose splitOnWhitespace in Query String Query #20965

Merged
merged 3 commits into from
Nov 2, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ public void reset(QueryParserSettings settings) {
setDefaultOperator(settings.defaultOperator());
setFuzzyPrefixLength(settings.fuzzyPrefixLength());
setLocale(settings.locale());
setSplitOnWhitespace(settings.splitOnWhitespace());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ public class QueryParserSettings {
/** To limit effort spent determinizing regexp queries. */
private int maxDeterminizedStates;

private boolean splitOnWhitespace;

/**
 * Creates parser settings for the given query string.
 *
 * @param queryString the query string stored in these settings
 */
public QueryParserSettings(String queryString) {
this.queryString = queryString;
}
Expand Down Expand Up @@ -290,4 +292,12 @@ public void fuzziness(Fuzziness fuzziness) {
/** Returns the fuzziness currently held by these settings. */
public Fuzziness fuzziness() {
return fuzziness;
}

/**
 * Sets the split-on-whitespace flag held by these settings.
 *
 * @param value the new value of the flag
 */
public void splitOnWhitespace(boolean value) {
this.splitOnWhitespace = value;
}

/** Returns the split-on-whitespace flag held by these settings. */
public boolean splitOnWhitespace() {
return splitOnWhitespace;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.io.stream.StreamInput;
Expand Down Expand Up @@ -59,6 +60,8 @@
public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQueryBuilder> {
public static final String NAME = "query_string";

public static final Version V_5_1_0_UNRELEASED = Version.fromId(5010099);

public static final boolean DEFAULT_AUTO_GENERATE_PHRASE_QUERIES = false;
public static final int DEFAULT_MAX_DETERMINED_STATES = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
public static final boolean DEFAULT_LOWERCASE_EXPANDED_TERMS = true;
Expand All @@ -72,6 +75,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
public static final Fuzziness DEFAULT_FUZZINESS = Fuzziness.AUTO;
public static final Operator DEFAULT_OPERATOR = Operator.OR;
public static final Locale DEFAULT_LOCALE = Locale.ROOT;
public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true;

private static final ParseField QUERY_FIELD = new ParseField("query");
private static final ParseField FIELDS_FIELD = new ParseField("fields");
Expand All @@ -98,6 +102,7 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
private static final ParseField LENIENT_FIELD = new ParseField("lenient");
private static final ParseField LOCALE_FIELD = new ParseField("locale");
private static final ParseField TIME_ZONE_FIELD = new ParseField("time_zone");
private static final ParseField SPLIT_ON_WHITESPACE = new ParseField("split_on_whitespace");


private final String queryString;
Expand Down Expand Up @@ -159,6 +164,8 @@ public class QueryStringQueryBuilder extends AbstractQueryBuilder<QueryStringQue
/** To limit effort spent determinizing regexp queries. */
private int maxDeterminizedStates = DEFAULT_MAX_DETERMINED_STATES;

private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;

public QueryStringQueryBuilder(String queryString) {
if (queryString == null) {
throw new IllegalArgumentException("query text missing");
Expand Down Expand Up @@ -200,6 +207,11 @@ public QueryStringQueryBuilder(StreamInput in) throws IOException {
timeZone = in.readOptionalTimeZone();
escape = in.readBoolean();
maxDeterminizedStates = in.readVInt();
if (in.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) {
splitOnWhitespace = in.readBoolean();
} else {
splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;
}
}

@Override
Expand Down Expand Up @@ -234,6 +246,9 @@ protected void doWriteTo(StreamOutput out) throws IOException {
out.writeOptionalTimeZone(timeZone);
out.writeBoolean(this.escape);
out.writeVInt(this.maxDeterminizedStates);
if (out.getVersion().onOrAfter(V_5_1_0_UNRELEASED)) {
out.writeBoolean(this.splitOnWhitespace);
}
}

public String queryString() {
Expand Down Expand Up @@ -570,6 +585,19 @@ public boolean escape() {
return this.escape;
}

/**
 * Whether query text should be split on whitespace prior to analysis.
 * Default is <code>{@value #DEFAULT_SPLIT_ON_WHITESPACE}</code>.
 *
 * @param value the new split-on-whitespace setting
 * @return this builder, so that calls can be chained
 */
public QueryStringQueryBuilder splitOnWhitespace(boolean value) {
this.splitOnWhitespace = value;
return this;
}

/**
 * Returns whether query text is split on whitespace prior to analysis.
 */
public boolean splitOnWhitespace() {
return splitOnWhitespace;
}

@Override
protected void doXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
Expand Down Expand Up @@ -626,6 +654,7 @@ protected void doXContent(XContentBuilder builder, Params params) throws IOExcep
builder.field(TIME_ZONE_FIELD.getPreferredName(), this.timeZone.getID());
}
builder.field(ESCAPE_FIELD.getPreferredName(), this.escape);
builder.field(SPLIT_ON_WHITESPACE.getPreferredName(), this.splitOnWhitespace);
printBoostAndQueryName(builder);
builder.endObject();
}
Expand Down Expand Up @@ -661,6 +690,7 @@ public static Optional<QueryStringQueryBuilder> fromXContent(QueryParseContext p
Fuzziness fuzziness = QueryStringQueryBuilder.DEFAULT_FUZZINESS;
String fuzzyRewrite = null;
String rewrite = null;
boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;
Map<String, Float> fieldsAndWeights = new HashMap<>();
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
Expand Down Expand Up @@ -750,6 +780,8 @@ public static Optional<QueryStringQueryBuilder> fromXContent(QueryParseContext p
}
} else if (parseContext.getParseFieldMatcher().match(currentFieldName, AbstractQueryBuilder.NAME_FIELD)) {
queryName = parser.text();
} else if (parseContext.getParseFieldMatcher().match(currentFieldName, SPLIT_ON_WHITESPACE)) {
splitOnWhitespace = parser.booleanValue();
} else {
throw new ParsingException(parser.getTokenLocation(), "[" + QueryStringQueryBuilder.NAME +
"] query does not support [" + currentFieldName + "]");
Expand Down Expand Up @@ -791,6 +823,7 @@ public static Optional<QueryStringQueryBuilder> fromXContent(QueryParseContext p
queryStringQuery.locale(locale);
queryStringQuery.boost(boost);
queryStringQuery.queryName(queryName);
queryStringQuery.splitOnWhitespace(splitOnWhitespace);
return Optional.of(queryStringQuery);
}

Expand Down Expand Up @@ -827,7 +860,8 @@ protected boolean doEquals(QueryStringQueryBuilder other) {
timeZone == null ? other.timeZone == null : other.timeZone != null &&
Objects.equals(timeZone.getID(), other.timeZone.getID()) &&
Objects.equals(escape, other.escape) &&
Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates);
Objects.equals(maxDeterminizedStates, other.maxDeterminizedStates) &&
Objects.equals(splitOnWhitespace, other.splitOnWhitespace);
}

@Override
Expand All @@ -836,7 +870,7 @@ protected int doHashCode() {
quoteFieldSuffix, autoGeneratePhraseQueries, allowLeadingWildcard, lowercaseExpandedTerms,
enablePositionIncrements, analyzeWildcard, locale.toLanguageTag(), fuzziness, fuzzyPrefixLength,
fuzzyMaxExpansions, fuzzyRewrite, phraseSlop, useDisMax, tieBreaker, rewrite, minimumShouldMatch, lenient,
timeZone == null ? 0 : timeZone.getID(), escape, maxDeterminizedStates);
timeZone == null ? 0 : timeZone.getID(), escape, maxDeterminizedStates, splitOnWhitespace);
}

@Override
Expand Down Expand Up @@ -904,6 +938,7 @@ protected Query doToQuery(QueryShardContext context) throws IOException {
qpSettings.lenient(lenient == null ? context.queryStringLenient() : lenient);
qpSettings.timeZone(timeZone);
qpSettings.maxDeterminizedStates(maxDeterminizedStates);
qpSettings.splitOnWhitespace(splitOnWhitespace);

MapperQueryParser queryParser = context.queryParser(qpSettings);
Query query;
Expand Down
2 changes: 2 additions & 0 deletions core/src/test/java/org/elasticsearch/VersionTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.monitor.os.OsStats;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
import org.elasticsearch.search.internal.AliasFilter;
Expand Down Expand Up @@ -275,6 +276,7 @@ public void testUnknownVersions() {
assertUnknownVersion(AliasFilter.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant
assertUnknownVersion(OsStats.V_5_1_0); // once we released 5.1.0 and it's added to Version.java we need to remove this constant
assertUnknownVersion(SimpleQueryStringBuilder.V_5_1_0_UNRELEASED);
assertUnknownVersion(QueryStringQueryBuilder.V_5_1_0_UNRELEASED);
// once we released 5.0.0 and it's added to Version.java we need to remove this constant
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

Expand Down Expand Up @@ -151,6 +152,7 @@ protected QueryStringQueryBuilder doCreateTestQueryBuilder() {
if (randomBoolean()) {
queryStringQueryBuilder.timeZone(randomDateTimeZone().getID());
}
queryStringQueryBuilder.splitOnWhitespace(randomBoolean());
return queryStringQueryBuilder;
}

Expand Down Expand Up @@ -532,6 +534,128 @@ public void testToQueryPhraseQueryBoostAndSlop() throws IOException {
assertThat(phraseQuery.getTerms().length, equalTo(2));
}

/**
 * Verifies the Lucene query built by {@link QueryStringQueryBuilder#toQuery} for both
 * values of {@code splitOnWhitespace}.
 * <p>
 * With {@code splitOnWhitespace(false)} the expected queries show the whole text reaching
 * each field as one unit, explicit operators still separating clauses, and the simple range
 * syntax on a numeric field failing (or yielding an empty query when lenient). With
 * {@code splitOnWhitespace(true)} each whitespace-separated term is expected as its own
 * per-field disjunction.
 *
 * @throws IOException if building a query fails
 */
public void testToQuerySplitOnWhitespace() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
    // splitOnWhitespace=false
    {
        // The text is not pre-split: the first field is expected to contribute the two
        // terms "foo" and "bar", the second field the single term "foo bar".
        QueryStringQueryBuilder queryBuilder =
            new QueryStringQueryBuilder("foo bar")
                .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
                .splitOnWhitespace(false);
        Query query = queryBuilder.toQuery(createShardContext());
        BooleanQuery bq1 =
            new BooleanQuery.Builder()
                .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD))
                .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD))
                .build();
        List<Query> disjuncts = new ArrayList<>();
        disjuncts.add(bq1);
        disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar")));
        DisjunctionMaxQuery expectedQuery = new DisjunctionMaxQuery(disjuncts, 0.0f);
        assertThat(query, equalTo(expectedQuery));
    }

    {
        // An explicit field clause is expected as its own term query, next to the same
        // disjunction as above for the remaining "foo bar" text.
        QueryStringQueryBuilder queryBuilder =
            new QueryStringQueryBuilder("mapped_string:other foo bar")
                .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
                .splitOnWhitespace(false);
        Query query = queryBuilder.toQuery(createShardContext());
        BooleanQuery bq1 =
            new BooleanQuery.Builder()
                .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "foo")), BooleanClause.Occur.SHOULD))
                .add(new BooleanClause(new TermQuery(new Term(STRING_FIELD_NAME, "bar")), BooleanClause.Occur.SHOULD))
                .build();
        List<Query> disjuncts = new ArrayList<>();
        disjuncts.add(bq1);
        disjuncts.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo bar")));
        DisjunctionMaxQuery disjunctionMaxQuery = new DisjunctionMaxQuery(disjuncts, 0.0f);
        BooleanQuery expectedQuery =
            new BooleanQuery.Builder()
                .add(disjunctionMaxQuery, BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term(STRING_FIELD_NAME, "other")), BooleanClause.Occur.SHOULD)
                .build();
        assertThat(query, equalTo(expectedQuery));
    }

    {
        // An explicit operator still separates the clauses: each operand is expected as
        // a per-field disjunction.
        QueryStringQueryBuilder queryBuilder =
            new QueryStringQueryBuilder("foo OR bar")
                .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
                .splitOnWhitespace(false);
        Query query = queryBuilder.toQuery(createShardContext());

        List<Query> disjuncts1 = new ArrayList<>();
        disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo")));
        disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo")));
        DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f);

        List<Query> disjuncts2 = new ArrayList<>();
        disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar")));
        disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar")));
        DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f);

        BooleanQuery expectedQuery =
            new BooleanQuery.Builder()
                .add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD))
                .add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD))
                .build();
        assertThat(query, equalTo(expectedQuery));
    }

    // split_on_whitespace=false breaks range query with simple syntax
    {
        // throws an exception when lenient is set to false
        QueryStringQueryBuilder queryBuilder =
            new QueryStringQueryBuilder(">10 foo")
                .field(INT_FIELD_NAME)
                .splitOnWhitespace(false);
        IllegalArgumentException exc =
            expectThrows(IllegalArgumentException.class, () -> queryBuilder.toQuery(createShardContext()));
        assertThat(exc.getMessage(), equalTo("For input string: \"10 foo\""));
    }

    {
        // returns an empty boolean query when lenient is set to true
        QueryStringQueryBuilder queryBuilder =
            new QueryStringQueryBuilder(">10 foo")
                .field(INT_FIELD_NAME)
                .splitOnWhitespace(false)
                .lenient(true);
        Query query = queryBuilder.toQuery(createShardContext());
        BooleanQuery expectedQuery = new BooleanQuery.Builder().build();
        // Actual value first, expected value in the matcher, as in the other cases, so a
        // failure message reports the two the right way round.
        assertThat(query, equalTo(expectedQuery));
    }

    // splitOnWhitespace=true
    {
        // Each whitespace-separated term is expected as its own per-field disjunction.
        QueryStringQueryBuilder queryBuilder =
            new QueryStringQueryBuilder("foo bar")
                .field(STRING_FIELD_NAME).field(STRING_FIELD_NAME_2)
                .splitOnWhitespace(true);
        Query query = queryBuilder.toQuery(createShardContext());

        List<Query> disjuncts1 = new ArrayList<>();
        disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME, "foo")));
        disjuncts1.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "foo")));
        DisjunctionMaxQuery maxQuery1 = new DisjunctionMaxQuery(disjuncts1, 0.0f);

        List<Query> disjuncts2 = new ArrayList<>();
        disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME, "bar")));
        disjuncts2.add(new TermQuery(new Term(STRING_FIELD_NAME_2, "bar")));
        DisjunctionMaxQuery maxQuery2 = new DisjunctionMaxQuery(disjuncts2, 0.0f);

        BooleanQuery expectedQuery =
            new BooleanQuery.Builder()
                .add(new BooleanClause(maxQuery1, BooleanClause.Occur.SHOULD))
                .add(new BooleanClause(maxQuery2, BooleanClause.Occur.SHOULD))
                .build();
        assertThat(query, equalTo(expectedQuery));
    }
}

public void testFromJson() throws IOException {
String json =
"{\n" +
Expand All @@ -552,6 +676,7 @@ public void testFromJson() throws IOException {
" \"phrase_slop\" : 0,\n" +
" \"locale\" : \"und\",\n" +
" \"escape\" : false,\n" +
" \"split_on_whitespace\" : true,\n" +
" \"boost\" : 1.0\n" +
" }\n" +
"}";
Expand Down
5 changes: 5 additions & 0 deletions docs/reference/query-dsl/query-string-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ http://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html[JODA timez
the query string. This allows the use of a field that has a different analysis chain
for exact matching. See <<mixing-exact-search-with-stemming,here>> for a
comprehensive example.

|`split_on_whitespace` |Whether query text should be split on whitespace prior to analysis.
If set to `false`, the query parser splits only around real operators and the
rest of the text is analyzed as a whole. Defaults to `true`.

|=======================================================================

When a multi term query is being generated, one can control how it gets
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/query-dsl/query-string-syntax.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,8 @@ A space may also be a reserved character. For instance, if you have a
synonym list which converts `"wi fi"` to `"wifi"`, a `query_string` search
for `"wi fi"` would fail. The query string parser would interpret your
query as a search for `"wi OR fi"`, while the token stored in your
index is actually `"wifi"`. Escaping the space will protect it from
being touched by the query string parser: `"wi\ fi"`.
index is actually `"wifi"`. The option `split_on_whitespace=false` will protect it from
being touched by the query string parser and will let the analysis run on the entire input (`"wi fi"`).
****

===== Empty Query
Expand Down