[Bug] fix case sensitivity for wildcard queries (#5462)

Fixes the wildcard query to not normalize the pattern when case_insensitive is set by the user. This is achieved by creating a new normalizedWildcardQuery method so that query_string queries (which do not support case sensitivity) can still normalize the pattern when the default analyzer is used; maintaining existing behavior. Signed-off-by: Nicholas Walter Knize <nknize@apache.org>
opensearch-project · Dec 8, 2022 · ce25dec · ce25dec
1 parent 2416d37
commit ce25dec
Show file tree

Hide file tree

Showing 9 changed files with 130 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -99,6 +99,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Removed
 ### Fixed
 - Fix 1.x compatibility bug with stored Tasks ([#5412](https://github.com/opensearch-project/OpenSearch/pull/5412))
+- Fix case sensitivity for wildcard queries ([#5462](https://github.com/opensearch-project/OpenSearch/pull/5462))
 ### Security
 
 [Unreleased 3.0]: https://github.com/opensearch-project/OpenSearch/compare/2.4...HEAD

diff --git a/server/src/internalClusterTest/java/org/opensearch/search/query/QueryStringIT.java b/server/src/internalClusterTest/java/org/opensearch/search/query/QueryStringIT.java
@@ -216,6 +216,39 @@ public void testKeywordWithWhitespace() throws Exception {
         assertHitCount(resp, 3L);
     }
 
+    public void testRegexCaseInsensitivity() throws Exception {
+        createIndex("messages");
+        List<IndexRequestBuilder> indexRequests = new ArrayList<>();
+        indexRequests.add(client().prepareIndex("messages").setId("1").setSource("message", "message: this is a TLS handshake"));
+        indexRequests.add(client().prepareIndex("messages").setId("2").setSource("message", "message: this is a tcp handshake"));
+        indexRandom(true, false, indexRequests);
+
+        SearchResponse response = client().prepareSearch("messages").setQuery(queryStringQuery("/TLS/").defaultField("message")).get();
+        assertNoFailures(response);
+        assertHitCount(response, 1);
+        assertHits(response.getHits(), "1");
+
+        response = client().prepareSearch("messages").setQuery(queryStringQuery("/tls/").defaultField("message")).get();
+        assertNoFailures(response);
+        assertHitCount(response, 1);
+        assertHits(response.getHits(), "1");
+
+        response = client().prepareSearch("messages").setQuery(queryStringQuery("/TCP/").defaultField("message")).get();
+        assertNoFailures(response);
+        assertHitCount(response, 1);
+        assertHits(response.getHits(), "2");
+
+        response = client().prepareSearch("messages").setQuery(queryStringQuery("/tcp/").defaultField("message")).get();
+        assertNoFailures(response);
+        assertHitCount(response, 1);
+        assertHits(response.getHits(), "2");
+
+        response = client().prepareSearch("messages").setQuery(queryStringQuery("/HANDSHAKE/").defaultField("message")).get();
+        assertNoFailures(response);
+        assertHitCount(response, 2);
+        assertHits(response.getHits(), "1", "2");
+    }
+
     public void testAllFields() throws Exception {
         String indexBody = copyToStringFromClasspath("/org/opensearch/search/query/all-query-index.json");
 

diff --git a/server/src/internalClusterTest/java/org/opensearch/search/query/SearchQueryIT.java b/server/src/internalClusterTest/java/org/opensearch/search/query/SearchQueryIT.java
@@ -89,12 +89,15 @@
 import java.time.format.DateTimeFormatter;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Random;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
 import java.util.regex.Pattern;
 
 import static java.util.Collections.singletonMap;
+import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.opensearch.action.support.WriteRequest.RefreshPolicy.IMMEDIATE;
 import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
 import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder;
@@ -2089,8 +2092,14 @@ public void testWildcardQueryNormalizationOnTextField() {
         refresh();
 
         {
+            // test default case insensitivity: false
             WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*");
             SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
+            assertHitCount(searchResponse, 0L);
+
+            // test case insensitivity set to true
+            wildCardQuery = wildcardQuery("field1", "Bb*").caseInsensitive(true);
+            searchResponse = client().prepareSearch().setQuery(wildCardQuery).get();
             assertHitCount(searchResponse, 1L);
 
             wildCardQuery = wildcardQuery("field1", "bb*");
@@ -2099,6 +2108,24 @@ public void testWildcardQueryNormalizationOnTextField() {
         }
     }
 
+    /** tests wildcard case sensitivity */
+    public void testWildcardCaseSensitivity() {
+        assertAcked(prepareCreate("test").setMapping("field", "type=text"));
+        client().prepareIndex("test").setId("1").setSource("field", "lowercase text").get();
+        refresh();
+
+        // test case sensitive
+        SearchResponse response = client().prepareSearch("test").setQuery(wildcardQuery("field", "Text").caseInsensitive(false)).get();
+        assertNoFailures(response);
+        assertHitCount(response, 0);
+
+        // test case insensitive
+        response = client().prepareSearch("test").setQuery(wildcardQuery("field", "Text").caseInsensitive(true)).get();
+        assertNoFailures(response);
+        assertHitCount(response, 1);
+        assertHits(response.getHits(), "1");
+    }
+
     /**
      * Reserved characters should be excluded when the normalization is applied for keyword fields.
      * See https://github.com/elastic/elasticsearch/issues/46300 for details.
@@ -2175,4 +2202,16 @@ public void testIssueFuzzyInsideSpanMulti() {
         SearchResponse response = client().prepareSearch("test").setQuery(query).get();
         assertHitCount(response, 1);
     }
+
+    /**
+     * asserts the search response hits include the expected ids
+     */
+    private void assertHits(SearchHits hits, String... ids) {
+        assertThat(hits.getTotalHits().value, equalTo((long) ids.length));
+        Set<String> hitIds = new HashSet<>();
+        for (SearchHit hit : hits.getHits()) {
+            hitIds.add(hit.getId());
+        }
+        assertThat(hitIds, containsInAnyOrder(ids));
+    }
 }
diff --git a/server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java
@@ -38,7 +38,10 @@
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.Nullable;
 import org.opensearch.common.lucene.Lucene;
 import org.opensearch.common.xcontent.XContentParser;
 import org.opensearch.index.analysis.IndexAnalyzers;
@@ -368,6 +371,18 @@ protected BytesRef indexedValueForSearch(Object value) {
             }
             return getTextSearchInfo().getSearchAnalyzer().normalize(name(), value.toString());
         }
+
+        @Override
+        public Query wildcardQuery(
+            String value,
+            @Nullable MultiTermQuery.RewriteMethod method,
+            boolean caseInsensitve,
+            QueryShardContext context
+        ) {
+            // keyword field types are always normalized, so ignore case sensitivity and force normalize the wildcard
+            // query text
+            return super.wildcardQuery(value, method, caseInsensitve, true, context);
+        }
     }
 
     private final boolean indexed;

diff --git a/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java
@@ -281,7 +281,7 @@ public Query prefixQuery(
     ) {
         throw new QueryShardException(
             context,
-            "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name + "] which is of type [" + typeName() + "]"
+            "Can only use prefix queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
         );
     }
 
@@ -290,6 +290,7 @@ public final Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteM
         return wildcardQuery(value, method, false, context);
     }
 
+    /** optionally normalize the wildcard pattern based on the value of {@code caseInsensitive} */
     public Query wildcardQuery(
         String value,
         @Nullable MultiTermQuery.RewriteMethod method,
@@ -298,11 +299,15 @@ public Query wildcardQuery(
     ) {
         throw new QueryShardException(
             context,
-            "Can only use wildcard queries on keyword, text and wildcard fields - not on ["
-                + name
-                + "] which is of type ["
-                + typeName()
-                + "]"
+            "Can only use wildcard queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
+        );
+    }
+
+    /** always normalizes the wildcard pattern to lowercase */
+    public Query normalizedWildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+        throw new QueryShardException(
+            context,
+            "Can only use wildcard queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
         );
     }
 

diff --git a/server/src/main/java/org/opensearch/index/mapper/StringFieldType.java b/server/src/main/java/org/opensearch/index/mapper/StringFieldType.java
@@ -152,8 +152,34 @@ public static final String normalizeWildcardPattern(String fieldname, String val
         return sb.toBytesRef().utf8ToString();
     }
 
+    /** optionally normalize the wildcard pattern based on the value of {@code caseInsensitive} */
     @Override
     public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
+        return wildcardQuery(value, method, caseInsensitive, false, context);
+    }
+
+    /** always normalizes the wildcard pattern to lowercase */
+    @Override
+    public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+        return wildcardQuery(value, method, false, true, context);
+    }
+
+    /**
+     * return a wildcard query
+     *
+     * @param value the pattern
+     * @param method rewrite method
+     * @param caseInsensitive should ignore case; note, only used if there is no analyzer, else we use the analyzer rules
+     * @param normalizeIfAnalyzed force normalize casing if an analyzer is used
+     * @param context the query shard context
+     */
+    public Query wildcardQuery(
+        String value,
+        MultiTermQuery.RewriteMethod method,
+        boolean caseInsensitive,
+        boolean normalizeIfAnalyzed,
+        QueryShardContext context
+    ) {
         failIfNotIndexed();
         if (context.allowExpensiveQueries() == false) {
             throw new OpenSearchException(
@@ -162,7 +188,7 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
         }
 
         Term term;
-        if (getTextSearchInfo().getSearchAnalyzer() != null) {
+        if (getTextSearchInfo().getSearchAnalyzer() != null && normalizeIfAnalyzed) {
             value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
             term = new Term(name(), value);
         } else {

diff --git a/server/src/main/java/org/opensearch/index/search/QueryStringQueryParser.java b/server/src/main/java/org/opensearch/index/search/QueryStringQueryParser.java
@@ -729,7 +729,8 @@ private Query getWildcardQuerySingle(String field, String termStr) throws ParseE
             if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?"))) {
                 throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
             }
-            return currentFieldType.wildcardQuery(termStr, getMultiTermRewriteMethod(), context);
+            // query string query is always normalized
+            return currentFieldType.normalizedWildcardQuery(termStr, getMultiTermRewriteMethod(), context);
         } catch (RuntimeException e) {
             if (lenient) {
                 return newLenientFieldQuery(field, e);

diff --git a/server/src/test/java/org/opensearch/index/query/PrefixQueryBuilderTests.java b/server/src/test/java/org/opensearch/index/query/PrefixQueryBuilderTests.java
@@ -130,7 +130,7 @@ public void testNumeric() throws Exception {
         QueryShardContext context = createShardContext();
         QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context));
         assertEquals(
-            "Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
+            "Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
             e.getMessage()
         );
     }

diff --git a/server/src/test/java/org/opensearch/index/query/QueryStringQueryBuilderTests.java b/server/src/test/java/org/opensearch/index/query/QueryStringQueryBuilderTests.java
@@ -873,7 +873,7 @@ public void testPrefixNumeric() throws Exception {
         QueryShardContext context = createShardContext();
         QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context));
         assertEquals(
-            "Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
+            "Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
             e.getMessage()
         );
         query.lenient(true);