Commit
Merge remote-tracking branch 'origin' into rescore-context
conggguan committed Apr 25, 2024
2 parents 69c3b0d + b277b07 commit 3a9da75
Showing 6 changed files with 38 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
- Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
- Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
- Removed the stream.findFirst implementation in favor of native iteration, improving hybrid query latencies by 35% ([#706](https://github.com/opensearch-project/neural-search/pull/706))
### Bug Fixes
- Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))
### Infrastructure
@@ -14,6 +14,7 @@
import java.util.Map;
import java.util.Objects;

import com.google.common.primitives.Ints;
import org.apache.lucene.search.DisiPriorityQueue;
import org.apache.lucene.search.DisiWrapper;
import org.apache.lucene.search.DisjunctionDISIApproximation;
@@ -42,7 +43,7 @@ public final class HybridQueryScorer extends Scorer {

private final float[] subScores;

private final Map<Query, List<Integer>> queryToIndex;
private final Map<Query, int[]> queryToIndex;

private final DocIdSetIterator approximation;
private final HybridScoreBlockBoundaryPropagator disjunctionBlockPropagator;
@@ -201,48 +202,55 @@ public float[] hybridScores() throws IOException {
continue;
}
Query query = scorer.getWeight().getQuery();
List<Integer> indexes = queryToIndex.get(query);
int[] indexes = queryToIndex.get(query);
// we need to find the index of first sub-query that hasn't been set yet. Such score will have initial value of "0.0"
int index = indexes.stream()
.mapToInt(idx -> idx)
.filter(idx -> Float.compare(scores[idx], 0.0f) == 0)
.findFirst()
.orElseThrow(
() -> new IllegalStateException(
String.format(
Locale.ROOT,
"cannot set score for one of hybrid search subquery [%s] and document [%d]",
query.toString(),
scorer.docID()
)
int index = -1;
for (int idx : indexes) {
if (Float.compare(scores[idx], 0.0f) == 0) {
index = idx;
break;
}
}
if (index == -1) {
throw new IllegalStateException(
String.format(
Locale.ROOT,
"cannot set score for one of hybrid search subquery [%s] and document [%d]",
query.toString(),
scorer.docID()
)
);
}
scores[index] = scorer.score();
}
return scores;
}
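
The loop above replaces a stream pipeline (mapToInt, filter, findFirst, orElseThrow) that had to be set up each time hybridScores() ran; both variants pick the first sub-query slot whose score is still 0.0f. Below is a minimal standalone sketch of the two patterns, using illustrative names rather than the plugin's actual fields:

import java.util.stream.IntStream;

public class FirstUnsetScoreExample {
    // Stream-based variant: builds an IntStream pipeline and evaluates lambdas per call.
    static int firstUnsetViaStream(int[] indexes, float[] scores) {
        return IntStream.of(indexes)
            .filter(idx -> Float.compare(scores[idx], 0.0f) == 0)
            .findFirst()
            .orElse(-1);
    }

    // Loop-based variant: same semantics, no stream machinery on the hot scoring path.
    static int firstUnsetViaLoop(int[] indexes, float[] scores) {
        for (int idx : indexes) {
            if (Float.compare(scores[idx], 0.0f) == 0) {
                return idx;
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        float[] scores = new float[] { 1.2f, 0.0f, 0.4f };
        int[] indexes = new int[] { 0, 1, 2 };
        System.out.println(firstUnsetViaStream(indexes, scores)); // 1
        System.out.println(firstUnsetViaLoop(indexes, scores));   // 1
    }
}

Dropping the per-document stream setup is the change that the CHANGELOG entry above credits with an approximately 35% improvement in hybrid query latencies.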

private Map<Query, List<Integer>> mapQueryToIndex() {
Map<Query, List<Integer>> queryToIndex = new HashMap<>();
private Map<Query, int[]> mapQueryToIndex() {
// we need list as number of identical queries is unknown
Map<Query, List<Integer>> queryToListOfIndexes = new HashMap<>();
int idx = 0;
for (Scorer scorer : subScorers) {
if (scorer == null) {
idx++;
continue;
}
Query query = scorer.getWeight().getQuery();
queryToIndex.putIfAbsent(query, new ArrayList<>());
queryToIndex.get(query).add(idx);
queryToListOfIndexes.putIfAbsent(query, new ArrayList<>());
queryToListOfIndexes.get(query).add(idx);
idx++;
}
// convert to the int array for better performance
Map<Query, int[]> queryToIndex = new HashMap<>();
queryToListOfIndexes.forEach((key, value) -> queryToIndex.put(key, Ints.toArray(value)));
return queryToIndex;
}
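
mapQueryToIndex above still gathers indexes into temporary lists (the number of identical sub-queries is unknown up front), but now converts each list once to a primitive int[] with Guava's Ints.toArray, so hybridScores iterates unboxed ints. A small self-contained sketch of that conversion, with illustrative values:

import com.google.common.primitives.Ints;
import java.util.ArrayList;
import java.util.List;

public class IndexArrayExample {
    public static void main(String[] args) {
        // Collect indexes into a list while their count is still unknown...
        List<Integer> indexes = new ArrayList<>();
        indexes.add(0);
        indexes.add(2);
        indexes.add(5);
        // ...then convert once to a primitive array for cheap, allocation-free iteration.
        int[] asArray = Ints.toArray(indexes);   // {0, 2, 5}
        System.out.println(asArray.length);      // 3
    }
}

The same arrays also let initializeSubScorersPQ count sub-queries as a simple sum of array lengths, as shown in the diff below.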

private DisiPriorityQueue initializeSubScorersPQ() {
Objects.requireNonNull(queryToIndex, "should not be null");
Objects.requireNonNull(subScorers, "should not be null");
// we need to count this way in order to include all identical sub-queries
int numOfSubQueries = queryToIndex.values().stream().map(List::size).reduce(0, Integer::sum);
int numOfSubQueries = queryToIndex.values().stream().map(array -> array.length).reduce(0, Integer::sum);
DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numOfSubQueries);
for (Scorer scorer : subScorers) {
if (scorer == null) {
@@ -5,6 +5,7 @@
package org.opensearch.neuralsearch.query;

import java.io.IOException;
import java.util.Locale;
import java.util.Objects;

import lombok.AllArgsConstructor;
@@ -39,6 +40,7 @@ public final class NeuralSparseQuery extends Query {
@Override
public String toString(String field) {
return String.format(
Locale.ROOT,
"NeuralSparseQuery(%s,%s,%s)",
currentQuery.toString(field),
highScoreTokenQuery.toString(field),
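
The Locale.ROOT argument added to toString above (and to the window-size check further down) pins String.format output to a locale-neutral form, so the produced strings do not change with the JVM's default locale. A minimal illustrative sketch of the difference, using a made-up value:

import java.util.Locale;

public class LocaleFormatExample {
    public static void main(String[] args) {
        double windowSize = 10000.5;
        // Default-locale formatting may render the decimal separator as a comma
        // (e.g. "10000,5" under a German default locale), which breaks parsing
        // and string-based assertions.
        String localeDependent = String.format("%.1f", windowSize);
        // Locale.ROOT pins formatting to a locale-neutral form: always "10000.5".
        String localeNeutral = String.format(Locale.ROOT, "%.1f", windowSize);
        System.out.println(localeDependent);
        System.out.println(localeNeutral);
    }
}
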
@@ -16,6 +16,7 @@

import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import static java.lang.Float.max;
@@ -50,6 +51,7 @@ public static void addRescoreContextFromNeuralSparseQuery(final Query query, fin
if (curWindowSize < 0 || curWindowSize > NeuralSparseTwoPhaseParameters.MAX_WINDOW_SIZE) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Two phase final windowSize %d out of score with limit %d. "
+ "You can change the value of cluster setting [plugins.neural_search.neural_sparse.two_phase.max_window_size] "
+ "to a integer at least 50.",
@@ -1021,13 +1021,11 @@ public void testDoToQuery_whenTwoPhaseParaEmpty_thenDegradeSuccess() {
assertTrue(query instanceof BooleanQuery);
List<BooleanClause> booleanClauseList = ((BooleanQuery) query).clauses();
assertEquals(2, ((BooleanQuery) query).clauses().size());
BooleanClause firstClause = booleanClauseList.get(0);
BooleanClause secondClause = booleanClauseList.get(1);

Query firstFeatureQuery = firstClause.getQuery();
assertEquals(firstFeatureQuery, FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f));
Query secondFeatureQuery = secondClause.getQuery();
assertEquals(secondFeatureQuery, FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f));
List<Query> actualQueries = booleanClauseList.stream().map(BooleanClause::getQuery).collect(Collectors.toList());
Query expectedQuery1 = FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f);
Query expectedQuery2 = FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f);
assertTrue("Expected query for 'world' not found", actualQueries.contains(expectedQuery1));
assertTrue("Expected query for 'hello' not found", actualQueries.contains(expectedQuery2));
}
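
The rewritten assertions above collect every clause's query via BooleanClause::getQuery and only check membership, so the test no longer depends on the order in which the BooleanQuery reports its clauses. A tiny standalone sketch of the same pattern, using string stand-ins for the Lucene queries:

import java.util.Arrays;
import java.util.List;

public class OrderIndependentAssertionExample {
    public static void main(String[] args) {
        // Stand-ins for the queries extracted via BooleanClause::getQuery in the real test.
        List<String> actualQueries = Arrays.asList("feature:world^2.0", "feature:hello^1.0");
        // Membership checks pass regardless of clause order, unlike index-based get(0)/get(1) assertions.
        if (!actualQueries.contains("feature:hello^1.0") || !actualQueries.contains("feature:world^2.0")) {
            throw new AssertionError("expected clause missing");
        }
        System.out.println("both expected clauses present, in any order");
    }
}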

@SneakyThrows
@@ -43,7 +43,7 @@ public void testToStringMethod() {
+ currentQuery.toString()
+ ','
+ highScoreTokenQuery.toString()
+ ", "
+ ","
+ lowScoreTokenQuery.toString()
+ ")";
assertEquals(expectedString, neuralSparseQuery.toString());