Commit
Merge remote-tracking branch 'origin' into rescore-context
conggguan committed Apr 25, 2024
2 parents 69c3b0d + b277b07 commit 3a9da75
Showing 6 changed files with 38 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
- Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
- Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
- Removed the stream.findFirst implementation in favor of native iteration, improving hybrid query latencies by 35% ([#706](https://github.com/opensearch-project/neural-search/pull/706))
### Bug Fixes
- Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))
### Infrastructure
@@ -14,6 +14,7 @@
import java.util.Map;
import java.util.Objects;

import com.google.common.primitives.Ints;
import org.apache.lucene.search.DisiPriorityQueue;
import org.apache.lucene.search.DisiWrapper;
import org.apache.lucene.search.DisjunctionDISIApproximation;
@@ -42,7 +43,7 @@ public final class HybridQueryScorer extends Scorer {

private final float[] subScores;

private final Map<Query, List<Integer>> queryToIndex;
private final Map<Query, int[]> queryToIndex;

private final DocIdSetIterator approximation;
private final HybridScoreBlockBoundaryPropagator disjunctionBlockPropagator;
@@ -201,48 +202,55 @@ public float[] hybridScores() throws IOException {
continue;
}
Query query = scorer.getWeight().getQuery();
List<Integer> indexes = queryToIndex.get(query);
int[] indexes = queryToIndex.get(query);
// we need to find the index of first sub-query that hasn't been set yet. Such score will have initial value of "0.0"
int index = indexes.stream()
.mapToInt(idx -> idx)
.filter(idx -> Float.compare(scores[idx], 0.0f) == 0)
.findFirst()
.orElseThrow(
() -> new IllegalStateException(
String.format(
Locale.ROOT,
"cannot set score for one of hybrid search subquery [%s] and document [%d]",
query.toString(),
scorer.docID()
)
int index = -1;
for (int idx : indexes) {
if (Float.compare(scores[idx], 0.0f) == 0) {
index = idx;
break;
}
}
if (index == -1) {
throw new IllegalStateException(
String.format(
Locale.ROOT,
"cannot set score for one of hybrid search subquery [%s] and document [%d]",
query.toString(),
scorer.docID()
)
);
}
scores[index] = scorer.score();
}
return scores;
}
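
The loop above replaces a stream pipeline (mapToInt, filter, findFirst, orElseThrow) that had to be set up each time hybridScores() ran; both variants pick the first sub-query slot whose score is still 0.0f. Below is a minimal standalone sketch of the two patterns, using illustrative names rather than the plugin's actual fields:

import java.util.stream.IntStream;

public class FirstUnsetScoreExample {
    // Stream-based variant: builds an IntStream pipeline and evaluates lambdas per call.
    static int firstUnsetViaStream(int[] indexes, float[] scores) {
        return IntStream.of(indexes)
            .filter(idx -> Float.compare(scores[idx], 0.0f) == 0)
            .findFirst()
            .orElse(-1);
    }

    // Loop-based variant: same semantics, no stream machinery on the hot scoring path.
    static int firstUnsetViaLoop(int[] indexes, float[] scores) {
        for (int idx : indexes) {
            if (Float.compare(scores[idx], 0.0f) == 0) {
                return idx;
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        float[] scores = new float[] { 1.2f, 0.0f, 0.4f };
        int[] indexes = new int[] { 0, 1, 2 };
        System.out.println(firstUnsetViaStream(indexes, scores)); // 1
        System.out.println(firstUnsetViaLoop(indexes, scores));   // 1
    }
}

Dropping the per-document stream setup is the change that the CHANGELOG entry above credits with an approximately 35% improvement in hybrid query latencies.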

private Map<Query, List<Integer>> mapQueryToIndex() {
Map<Query, List<Integer>> queryToIndex = new HashMap<>();
private Map<Query, int[]> mapQueryToIndex() {
// we need list as number of identical queries is unknown
Map<Query, List<Integer>> queryToListOfIndexes = new HashMap<>();
int idx = 0;
for (Scorer scorer : subScorers) {
if (scorer == null) {
idx++;
continue;
}
Query query = scorer.getWeight().getQuery();
queryToIndex.putIfAbsent(query, new ArrayList<>());
queryToIndex.get(query).add(idx);
queryToListOfIndexes.putIfAbsent(query, new ArrayList<>());
queryToListOfIndexes.get(query).add(idx);
idx++;
}
// convert to the int array for better performance
Map<Query, int[]> queryToIndex = new HashMap<>();
queryToListOfIndexes.forEach((key, value) -> queryToIndex.put(key, Ints.toArray(value)));
return queryToIndex;
}
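
mapQueryToIndex above still gathers indexes into temporary lists (the number of identical sub-queries is unknown up front), but now converts each list once to a primitive int[] with Guava's Ints.toArray, so hybridScores iterates unboxed ints. A small self-contained sketch of that conversion, with illustrative values:

import com.google.common.primitives.Ints;
import java.util.ArrayList;
import java.util.List;

public class IndexArrayExample {
    public static void main(String[] args) {
        // Collect indexes into a list while their count is still unknown...
        List<Integer> indexes = new ArrayList<>();
        indexes.add(0);
        indexes.add(2);
        indexes.add(5);
        // ...then convert once to a primitive array for cheap, allocation-free iteration.
        int[] asArray = Ints.toArray(indexes);   // {0, 2, 5}
        System.out.println(asArray.length);      // 3
    }
}

The same arrays also let initializeSubScorersPQ count sub-queries as a simple sum of array lengths, as shown in the diff below.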

private DisiPriorityQueue initializeSubScorersPQ() {
Objects.requireNonNull(queryToIndex, "should not be null");
Objects.requireNonNull(subScorers, "should not be null");
// we need to count this way in order to include all identical sub-queries
int numOfSubQueries = queryToIndex.values().stream().map(List::size).reduce(0, Integer::sum);
int numOfSubQueries = queryToIndex.values().stream().map(array -> array.length).reduce(0, Integer::sum);
DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numOfSubQueries);
for (Scorer scorer : subScorers) {
if (scorer == null) {
@@ -5,6 +5,7 @@
package org.opensearch.neuralsearch.query;

import java.io.IOException;
import java.util.Locale;
import java.util.Objects;

import lombok.AllArgsConstructor;
@@ -39,6 +40,7 @@ public final class NeuralSparseQuery extends Query {
@Override
public String toString(String field) {
return String.format(
Locale.ROOT,
"NeuralSparseQuery(%s,%s,%s)",
currentQuery.toString(field),
highScoreTokenQuery.toString(field),
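
The Locale.ROOT argument added to toString above (and to the window-size check further down) pins String.format output to a locale-neutral form, so the produced strings do not change with the JVM's default locale. A minimal illustrative sketch of the difference, using a made-up value:

import java.util.Locale;

public class LocaleFormatExample {
    public static void main(String[] args) {
        double windowSize = 10000.5;
        // Default-locale formatting may render the decimal separator as a comma
        // (e.g. "10000,5" under a German default locale), which breaks parsing
        // and string-based assertions.
        String localeDependent = String.format("%.1f", windowSize);
        // Locale.ROOT pins formatting to a locale-neutral form: always "10000.5".
        String localeNeutral = String.format(Locale.ROOT, "%.1f", windowSize);
        System.out.println(localeDependent);
        System.out.println(localeNeutral);
    }
}
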
@@ -16,6 +16,7 @@

import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import static java.lang.Float.max;
@@ -50,6 +51,7 @@ public static void addRescoreContextFromNeuralSparseQuery(final Query query, fin
if (curWindowSize < 0 || curWindowSize > NeuralSparseTwoPhaseParameters.MAX_WINDOW_SIZE) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Two phase final windowSize %d out of score with limit %d. "
+ "You can change the value of cluster setting [plugins.neural_search.neural_sparse.two_phase.max_window_size] "
+ "to a integer at least 50.",
@@ -1021,13 +1021,11 @@ public void testDoToQuery_whenTwoPhaseParaEmpty_thenDegradeSuccess() {
assertTrue(query instanceof BooleanQuery);
List<BooleanClause> booleanClauseList = ((BooleanQuery) query).clauses();
assertEquals(2, ((BooleanQuery) query).clauses().size());
BooleanClause firstClause = booleanClauseList.get(0);
BooleanClause secondClause = booleanClauseList.get(1);

Query firstFeatureQuery = firstClause.getQuery();
assertEquals(firstFeatureQuery, FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f));
Query secondFeatureQuery = secondClause.getQuery();
assertEquals(secondFeatureQuery, FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f));
List<Query> actualQueries = booleanClauseList.stream().map(BooleanClause::getQuery).collect(Collectors.toList());
Query expectedQuery1 = FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f);
Query expectedQuery2 = FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f);
assertTrue("Expected query for 'world' not found", actualQueries.contains(expectedQuery1));
assertTrue("Expected query for 'hello' not found", actualQueries.contains(expectedQuery2));
}
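
The rewritten assertions above collect every clause's query via BooleanClause::getQuery and only check membership, so the test no longer depends on the order in which the BooleanQuery reports its clauses. A tiny standalone sketch of the same pattern, using string stand-ins for the Lucene queries:

import java.util.Arrays;
import java.util.List;

public class OrderIndependentAssertionExample {
    public static void main(String[] args) {
        // Stand-ins for the queries extracted via BooleanClause::getQuery in the real test.
        List<String> actualQueries = Arrays.asList("feature:world^2.0", "feature:hello^1.0");
        // Membership checks pass regardless of clause order, unlike index-based get(0)/get(1) assertions.
        if (!actualQueries.contains("feature:hello^1.0") || !actualQueries.contains("feature:world^2.0")) {
            throw new AssertionError("expected clause missing");
        }
        System.out.println("both expected clauses present, in any order");
    }
}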

@SneakyThrows
@@ -43,7 +43,7 @@ public void testToStringMethod() {
+ currentQuery.toString()
+ ','
+ highScoreTokenQuery.toString()
+ ", "
+ ","
+ lowScoreTokenQuery.toString()
+ ")";
assertEquals(expectedString, neuralSparseQuery.toString());