Merge branch 'main' into BugTestOther

Signed-off-by: Yuye Zhu <yuyezhu@amazon.com>
opensearch-project · Apr 23, 2024 · 5c32d69 · 5c32d69
2 parents 9e56e0c + 86f6d4c
commit 5c32d69
Show file tree

Hide file tree

Showing 54 changed files with 3,543 additions and 480 deletions.
diff --git a/.github/workflows/test_aggregations.yml b/.github/workflows/test_aggregations.yml
@@ -0,0 +1,71 @@
+# Runs the aggregation integration tests that a default integTest run excludes.
+name: Run Additional Tests for Neural Search
+on:
+  # Nightly run.
+  schedule:
+    - cron: '0 0 * * *'
+  # '*' does not match branch names that contain '/', so 'feature/**' is listed
+  # explicitly to cover nested feature branches.
+  push:
+    branches:
+      - '*'
+      - 'feature/**'
+  pull_request:
+    branches:
+      - '*'
+      - 'feature/**'
+
+jobs:
+  # Calls the reusable workflow from opensearch-project/opensearch-build. Its
+  # ci-image-version-linux output is consumed by Check-neural-search-linux as
+  # the container image.
+  Get-CI-Image-Tag:
+    uses: opensearch-project/opensearch-build/.github/workflows/get-ci-image-tag.yml@main
+    with:
+      product: opensearch
+
+  Check-neural-search-linux:
+    name: Integ Tests Linux
+    needs: Get-CI-Image-Tag
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        java:
+          - 11
+          - 17
+          - 21
+        os:
+          - ubuntu-latest
+    container:
+      # Same image the opensearch-build team uses to build the OpenSearch
+      # distribution; the tag is expected to change as dependencies are updated.
+      image: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-version-linux }}
+      # Run as root so GitHub Actions can install the runner binary in the
+      # container without permission issues.
+      options: --user root
+
+    steps:
+      - name: Checkout neural-search
+        uses: actions/checkout@v1
+
+      - name: Setup Java ${{ matrix.java }}
+        uses: actions/setup-java@v1
+        with:
+          java-version: ${{ matrix.java }}
+
+      # The workspace is handed over to UID 1000 and Gradle is run as that user
+      # instead of root. -Dtest_aggs=true lifts the exclusion that build.gradle
+      # places on the aggregation ITs selected by --tests.
+      - name: Run tests
+        run: |
+          chown -R 1000:1000 `pwd`
+          su `id -un 1000` -c "./gradlew ':integTest' -Dtest_aggs=true --tests \"org.opensearch.neuralsearch.query.aggregation.*IT\""
+
+  Check-neural-search-windows:
+    name: Integ Tests Windows
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        java:
+          - 11
+          - 17
+          - 21
+        os:
+          - windows-latest
+
+    # No container and no dependency on Get-CI-Image-Tag: the steps run
+    # directly on the runner selected by matrix.os.
+    steps:
+      - name: Checkout neural-search
+        uses: actions/checkout@v1
+
+      - name: Setup Java ${{ matrix.java }}
+        uses: actions/setup-java@v1
+        with:
+          java-version: ${{ matrix.java }}
+
+      # -Dtest_aggs=true lifts the exclusion that build.gradle places on the
+      # aggregation ITs selected by --tests.
+      - name: Run tests
+        run: |
+          ./gradlew ':integTest' -Dtest_aggs=true --tests "org.opensearch.neuralsearch.query.aggregation.*IT"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,15 +11,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Fix typo for sparse encoding processor factory([#578](https://github.com/opensearch-project/neural-search/pull/578))
 - Add non-null check for queryBuilder in NeuralQueryEnricherProcessor ([#615](https://github.com/opensearch-project/neural-search/pull/615))
 - Bug test for other BWC tests
+- Add max_token_score field placeholder in NeuralSparseQueryBuilder to fix the rolling-upgrade from 2.x nodes bwc tests. ([#696](https://github.com/opensearch-project/neural-search/pull/696))
 ### Infrastructure
+- Adding integration tests for the scenario of hybrid queries with aggregations ([#632](https://github.com/opensearch-project/neural-search/pull/632))
 ### Documentation
 ### Maintenance
 ### Refactoring
 
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.13...2.x)
 ### Features
 ### Enhancements
+- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
 - Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
+- Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
 ### Bug Fixes
 - Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))
 ### Infrastructure

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -31,10 +31,13 @@ To send us a pull request, please:
 
 1. Fork the repository.
 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
-3. Ensure local tests pass.
-4. Commit to your fork using clear commit messages.
-5. Send us a pull request, answering any default questions in the pull request interface.
-6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
+3. Include tests that check your new feature or bug fix. Ideally, we're looking for unit, integration, and BWC tests, but that depends on how big and critical your change is. 
+If you're adding an integration test and it is using local ML models, please make sure that the number of model deployments is limited, and you're using the smallest possible model. 
+Each model deployment consumes resources, and having too many models may cause unexpected test failures.
+4. Ensure local tests pass.
+5. Commit to your fork using clear commit messages.
+6. Send us a pull request, answering any default questions in the pull request interface.
+7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
 
 GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
 [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).

diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md
@@ -181,6 +181,11 @@ Additionally, to run integration tests on multi nodes with security enabled, run
 ./gradlew :integTest -Dsecurity.enabled=true -PnumNodes=3
 ```
 
+Some integration tests are skipped by default, mainly to save time and resources. A special parameter is required to include those tests in the executed test suite. For example, the following command enables additional tests for aggregations when they are bundled with hybrid queries:
+```
+./gradlew :integTest -PnumNodes=3 -Dtest_aggs=true
+```
+
 Integration tests can be run with remote cluster. For that run the following command and replace host/port/cluster name values with ones for the target cluster:
 
 ```

diff --git a/build.gradle b/build.gradle
@@ -308,6 +308,12 @@ task integTest(type: RestIntegTestTask) {
     description = "Run tests against a cluster"
     testClassesDirs = sourceSets.test.output.classesDirs
     classpath = sourceSets.test.runtimeClasspath
+    boolean runCompleteAggsTestSuite = Boolean.parseBoolean(System.getProperty('test_aggs', "false"))
+    if (!runCompleteAggsTestSuite) {
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.query.aggregation.*IT"
+        }
+    }
 }
 tasks.named("check").configure { dependsOn(integTest) }
 

diff --git a/qa/restart-upgrade/build.gradle b/qa/restart-upgrade/build.gradle
@@ -65,7 +65,7 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
     systemProperty 'tests.skip_delete_model_index', 'true'
     systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version
 
-    //Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
+    // Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
     // because these features were released in 2.11 version.
     if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
         filter {
@@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
         }
     }
 
+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'
@@ -107,7 +114,7 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
     systemProperty 'tests.is_old_cluster', 'false'
     systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version
 
-    //Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
+    // Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
     // because these features were released in 2.11 version.
     if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
         filter {
@@ -125,6 +132,13 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
         }
     }
 
+    // Excluding the text chunking processor test because this feature was introduced in 2.13
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'

diff --git a/...ade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java b/...ade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java
@@ -11,11 +11,11 @@
 import org.junit.Before;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.neuralsearch.BaseNeuralSearchIT;
-import static org.opensearch.neuralsearch.TestUtils.NEURAL_SEARCH_BWC_PREFIX;
-import static org.opensearch.neuralsearch.TestUtils.CLIENT_TIMEOUT_VALUE;
-import static org.opensearch.neuralsearch.TestUtils.RESTART_UPGRADE_OLD_CLUSTER;
-import static org.opensearch.neuralsearch.TestUtils.BWC_VERSION;
-import static org.opensearch.neuralsearch.TestUtils.generateModelId;
+import static org.opensearch.neuralsearch.util.TestUtils.NEURAL_SEARCH_BWC_PREFIX;
+import static org.opensearch.neuralsearch.util.TestUtils.CLIENT_TIMEOUT_VALUE;
+import static org.opensearch.neuralsearch.util.TestUtils.RESTART_UPGRADE_OLD_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.BWC_VERSION;
+import static org.opensearch.neuralsearch.util.TestUtils.generateModelId;
 import org.opensearch.test.rest.OpenSearchRestTestCase;
 
 public abstract class AbstractRestartUpgradeRestTestCase extends BaseNeuralSearchIT {
@@ -99,4 +99,11 @@ protected void createPipelineForSparseEncodingProcessor(final String modelId, fi
         );
         createPipelineProcessor(requestBody, pipelineName, modelId);
     }
+
+    protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
+        String requestBody = Files.readString(
+            Path.of(classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json").toURI())
+        );
+        createPipelineProcessor(requestBody, pipelineName, "");
+    }
 }
diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/HybridSearchIT.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/HybridSearchIT.java
@@ -11,12 +11,12 @@
 import java.util.List;
 import java.util.Map;
 import org.opensearch.index.query.MatchQueryBuilder;
-import static org.opensearch.neuralsearch.TestUtils.getModelId;
-import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
-import static org.opensearch.neuralsearch.TestUtils.PARAM_NAME_WEIGHTS;
-import static org.opensearch.neuralsearch.TestUtils.TEXT_EMBEDDING_PROCESSOR;
-import static org.opensearch.neuralsearch.TestUtils.DEFAULT_NORMALIZATION_METHOD;
-import static org.opensearch.neuralsearch.TestUtils.DEFAULT_COMBINATION_METHOD;
+import static org.opensearch.neuralsearch.util.TestUtils.getModelId;
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.PARAM_NAME_WEIGHTS;
+import static org.opensearch.neuralsearch.util.TestUtils.TEXT_EMBEDDING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.DEFAULT_NORMALIZATION_METHOD;
+import static org.opensearch.neuralsearch.util.TestUtils.DEFAULT_COMBINATION_METHOD;
 import org.opensearch.neuralsearch.query.HybridQueryBuilder;
 import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
 

diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/MultiModalSearchIT.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/MultiModalSearchIT.java
@@ -7,9 +7,9 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Map;
-import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
-import static org.opensearch.neuralsearch.TestUtils.TEXT_IMAGE_EMBEDDING_PROCESSOR;
-import static org.opensearch.neuralsearch.TestUtils.getModelId;
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.TEXT_IMAGE_EMBEDDING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.getModelId;
 import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
 
 public class MultiModalSearchIT extends AbstractRestartUpgradeRestTestCase {

diff --git a/...upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralQueryEnricherProcessorIT.java b/...upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralQueryEnricherProcessorIT.java
@@ -4,12 +4,12 @@
  */
 package org.opensearch.neuralsearch.bwc;
 
-import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
-import static org.opensearch.neuralsearch.TestUtils.SPARSE_ENCODING_PROCESSOR;
-import static org.opensearch.neuralsearch.TestUtils.TEXT_EMBEDDING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.SPARSE_ENCODING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.TEXT_EMBEDDING_PROCESSOR;
 
 import org.opensearch.common.settings.Settings;
-import org.opensearch.neuralsearch.TestUtils;
+import org.opensearch.neuralsearch.util.TestUtils;
 import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
 import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder;
 

diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralSparseSearchIT.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/NeuralSparseSearchIT.java
@@ -10,10 +10,10 @@
 import java.util.Map;
 import org.opensearch.index.query.BoolQueryBuilder;
 import org.opensearch.index.query.MatchQueryBuilder;
-import org.opensearch.neuralsearch.TestUtils;
-import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
-import static org.opensearch.neuralsearch.TestUtils.SPARSE_ENCODING_PROCESSOR;
-import static org.opensearch.neuralsearch.TestUtils.objectToFloat;
+import org.opensearch.neuralsearch.util.TestUtils;
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.SPARSE_ENCODING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.objectToFloat;
 import org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder;
 
 public class NeuralSparseSearchIT extends AbstractRestartUpgradeRestTestCase {

diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/SemanticSearchIT.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/SemanticSearchIT.java
@@ -7,9 +7,9 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Map;
-import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;
-import static org.opensearch.neuralsearch.TestUtils.getModelId;
-import static org.opensearch.neuralsearch.TestUtils.TEXT_EMBEDDING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.getModelId;
+import static org.opensearch.neuralsearch.util.TestUtils.TEXT_EMBEDDING_PROCESSOR;
 import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
 
 public class SemanticSearchIT extends AbstractRestartUpgradeRestTestCase {

diff --git a/...estart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java b/...estart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.bwc;
+
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import org.opensearch.index.query.MatchAllQueryBuilder;
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+
+/**
+ * Restart-upgrade BWC test for the text chunking ingest processor.
+ *
+ * Against the old cluster it creates the pipeline and index and ingests one document;
+ * against the new cluster it ingests a second document into the same index. In both
+ * phases it checks the document count and the chunked output of the first search hit.
+ */
+public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase {
+
+    private static final String PIPELINE_NAME = "pipeline-text-chunking";
+    private static final String INPUT_FIELD = "body";
+    private static final String OUTPUT_FIELD = "body_chunk";
+    private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
+    private static final String TEST_INGEST_TEXT =
+        "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
+    // Chunks expected in OUTPUT_FIELD after TEST_INGEST_TEXT passes through the pipeline.
+    List<String> expectedPassages = List.of(
+        "This is an example document to be chunked. The document ",
+        "contains a single paragraph, two sentences and 24 tokens by ",
+        "standard tokenizer in OpenSearch."
+    );
+
+    // Test restart-upgrade text chunking processor
+    // Create Text Chunking Processor, Ingestion Pipeline and add document
+    // Validate process, pipeline and document count in restart-upgrade scenario
+    public void testTextChunkingProcessor_E2EFlow() throws Exception {
+        waitForClusterHealthGreen(NODES_BWC_CLUSTER);
+        String indexName = getIndexNameForTest();
+        if (isRunningAgainstOldCluster()) {
+            // Old cluster: set up pipeline and index, ingest the first document.
+            createPipelineForTextChunkingProcessor(PIPELINE_NAME);
+            createChunkingIndex(indexName);
+            addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+            validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages);
+        } else {
+            // New cluster: ingest a second document into the existing index, then
+            // always clean up the index and pipeline, even if validation fails.
+            try {
+                addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
+                validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages);
+            } finally {
+                wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
+            }
+        }
+    }
+
+    /**
+     * Creates the test index from the settings resource at TEST_INDEX_SETTING_PATH,
+     * passing PIPELINE_NAME along to {@code createIndexWithConfiguration}.
+     *
+     * @param indexName name of the index to create
+     * @throws Exception if the settings resource is missing or cannot be read, or index creation fails
+     */
+    private void createChunkingIndex(String indexName) throws Exception {
+        URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
+        Objects.requireNonNull(documentURLPath);
+        String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
+        createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
+    }
+
+    /**
+     * Asserts the index document count and that the first hit of a match-all search
+     * carries the expected value in the given field of its {@code _source}.
+     *
+     * @param indexName     index to inspect
+     * @param fieldName     {@code _source} field expected to hold the processor output
+     * @param documentCount expected number of documents in the index
+     * @param expected      expected value of {@code fieldName}
+     */
+    private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
+        int docCount = getDocCount(indexName);
+        assertEquals(documentCount, docCount);
+        MatchAllQueryBuilder query = new MatchAllQueryBuilder();
+        Map<String, Object> searchResults = search(indexName, query, 10);
+        assertNotNull(searchResults);
+        Map<String, Object> document = getFirstInnerHit(searchResults);
+        assertNotNull(document);
+        Object documentSource = document.get("_source");
+        // JUnit assertions instead of the `assert` keyword, so the checks also run
+        // when the JVM is started without -ea.
+        assertTrue(documentSource instanceof Map);
+        @SuppressWarnings("unchecked")
+        Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
+        assertTrue(documentSourceMap.containsKey(fieldName));
+        Object ingestOutputs = documentSourceMap.get(fieldName);
+        assertEquals(expected, ingestOutputs);
+    }
+}
diff --git a/qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json b/qa/restart-upgrade/src/test/resources/processor/ChunkingIndexSettings.json
@@ -0,0 +1,17 @@
+{
+  "settings":{
+    "default_pipeline": "%s",
+    "number_of_shards": 3,
+    "number_of_replicas": 1
+  },
+  "mappings": {
+    "properties": {
+      "body": {
+        "type": "text"
+      },
+      "body_chunk": {
+        "type": "text"
+      }
+    }
+  }
+}
diff --git a/...t-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json b/...t-upgrade/src/test/resources/processor/PipelineForTextChunkingProcessorConfiguration.json
@@ -0,0 +1,18 @@
+{
+  "description": "An example fixed token length chunker pipeline with standard tokenizer",
+  "processors" : [
+    {
+      "text_chunking": {
+        "field_map": {
+          "body": "body_chunk"
+        },
+        "algorithm": {
+          "fixed_token_length": {
+            "token_limit": 10,
+            "tokenizer": "standard"
+          }
+        }
+      }
+    }
+  ]
+}