Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test: bwc test for text chunking processor #661

Merged
merged 29 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
fb8b5e8
bwc test for text chunking processor
yuye-aws Apr 2, 2024
67dfb19
spotless apply
yuye-aws Apr 2, 2024
5bea529
update changelog
yuye-aws Apr 2, 2024
af356a1
spotless apply
yuye-aws Apr 2, 2024
476178f
add test document for restart upgrade
yuye-aws Apr 2, 2024
e3ae036
rename pipeline configuration file
yuye-aws Apr 2, 2024
3063ea0
fix pipeline create bug
yuye-aws Apr 2, 2024
26b55cc
fix pipeline create bug
yuye-aws Apr 2, 2024
50b124a
filter tests for lower versions
yuye-aws Apr 2, 2024
d9aa2fe
index create in chunking bwc test
yuye-aws Apr 2, 2024
604e451
index create in chunking bwc test
yuye-aws Apr 2, 2024
46dbcab
index create in chunking bwc test
yuye-aws Apr 2, 2024
93f2de2
index validate in chunking bwc test
yuye-aws Apr 2, 2024
adcceb2
filter bwc test for lower version
yuye-aws Apr 3, 2024
4295987
bug fix in document ingestion in text chunking test
yuye-aws Apr 3, 2024
86ed5b8
ensure index creation in text chunking bwc test
yuye-aws Apr 3, 2024
6085bb0
add comment
yuye-aws Apr 3, 2024
b28b4dc
update index setting
yuye-aws Apr 3, 2024
70804e0
update change log
yuye-aws Apr 10, 2024
f7d8a2e
update gradle comment format
yuye-aws Apr 10, 2024
0714737
update gradle file format
yuye-aws Apr 10, 2024
1c2dec5
rename bwc test filename
yuye-aws Apr 10, 2024
c392b4c
update gradle file format
yuye-aws Apr 10, 2024
353baac
update gradle file to filter tests
yuye-aws Apr 10, 2024
647c630
merge method createPipelineProcessorWithoutModelId
yuye-aws Apr 10, 2024
643ae96
text chunking processor it: create pipeline method rename
yuye-aws Apr 10, 2024
9a8ead6
fix it failure
yuye-aws Apr 10, 2024
f189c11
include index mapping for text chunking index setting
yuye-aws Apr 12, 2024
9b2334e
update nitpicking
yuye-aws Apr 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.13...2.x)
### Features
### Enhancements
- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
- Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
### Bug Fixes
- Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))
Expand Down
18 changes: 16 additions & 2 deletions qa/restart-upgrade/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.skip_delete_model_index', 'true'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
vibrantvarun marked this conversation as resolved.
Show resolved Hide resolved
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
Expand All @@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because we introduce this feature in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
Expand All @@ -107,7 +114,7 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.is_old_cluster', 'false'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
Expand All @@ -125,6 +132,13 @@ task testAgainstNewCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because we introduce this feature in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.junit.Before;
import org.opensearch.common.settings.Settings;
Expand Down Expand Up @@ -99,4 +101,11 @@ protected void createPipelineForSparseEncodingProcessor(final String modelId, fi
);
createPipelineProcessor(requestBody, pipelineName, modelId);
}

protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make this identical to the methods written above? Apart from this, it looks good to me.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

String requestBody = Files.readString(
Path.of(classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json").toURI())
);
createPipelineProcessor(requestBody, pipelineName, "");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchAllQueryBuilder;
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;

/**
 * Backward-compatibility test for the text chunking processor in the restart-upgrade scenario.
 *
 * <p>The single test method branches on {@code isRunningAgainstOldCluster()}: against the old
 * cluster it creates the pipeline and index and ingests the first document; otherwise it ingests
 * a second document, validates the index, and wipes the test resources.
 */
public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase {

    private static final String PIPELINE_NAME = "pipeline-text-chunking";
    private static final String INPUT_FIELD = "body";
    private static final String OUTPUT_FIELD = "body_chunk";
    private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
    private static final String TEST_INGEST_TEXT =
        "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
    // Value the test expects to find in OUTPUT_FIELD after TEST_INGEST_TEXT has been ingested.
    final List<String> expectedPassages = List.of(
        "This is an example document to be chunked. The document ",
        "contains a single paragraph, two sentences and 24 tokens by ",
        "standard tokenizer in OpenSearch."
    );

    // Test restart-upgrade text chunking processor
    // Create Text Chunking Processor, Ingestion Pipeline and add document
    // Validate process, pipeline and document count in restart-upgrade scenario
    public void testTextChunkingProcessor_E2EFlow() throws Exception {
        waitForClusterHealthGreen(NODES_BWC_CLUSTER);
        String indexName = getIndexNameForTest();
        if (isRunningAgainstOldCluster()) {
            // Old cluster: set up pipeline and index, ingest the first document and validate it.
            createPipelineForTextChunkingProcessor(PIPELINE_NAME);
            createChunkingIndex(indexName);
            // NOTE(review): the two trailing null arguments are optional inputs of the inherited helper - confirm their meaning there.
            addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
            validateTestIndex(indexName, OUTPUT_FIELD, 1, expectedPassages);
        } else {
            try {
                // Not the old cluster: the index is expected to hold document "0" already, so the count becomes 2.
                addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
                validateTestIndex(indexName, OUTPUT_FIELD, 2, expectedPassages);
            } finally {
                // Clean up even when validation fails so later runs start from a clean state.
                wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
            }
        }
    }

    /**
     * Reads the index configuration from {@link #TEST_INDEX_SETTING_PATH} on the classpath and
     * creates the index from it, passing {@link #PIPELINE_NAME} to the inherited helper.
     *
     * @param indexName name of the index to create
     * @throws Exception if the resource cannot be read or the index cannot be created
     */
    private void createChunkingIndex(String indexName) throws Exception {
        URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
        Objects.requireNonNull(documentURLPath, "test resource not found on classpath: " + TEST_INDEX_SETTING_PATH);
        String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
        // presumably the helper substitutes the pipeline name into the "%s" placeholder of the settings - verify in the base class
        createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
    }

    /**
     * Asserts the document count of the index and that the first hit of a match-all search
     * carries the expected value in the given field of its {@code _source}.
     *
     * @param indexName     index to inspect
     * @param fieldName     field of {@code _source} holding the processor output
     * @param documentCount expected number of documents in the index
     * @param expected      expected value of {@code fieldName} in the first hit
     */
    private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
        int docCount = getDocCount(indexName);
        assertEquals(documentCount, docCount);
        MatchAllQueryBuilder query = new MatchAllQueryBuilder();
        Map<String, Object> searchResults = search(indexName, query, 10);
        assertNotNull(searchResults);
        Map<String, Object> document = getFirstInnerHit(searchResults);
        assertNotNull(document);
        Object documentSource = document.get("_source");
        // JUnit assertions instead of the `assert` keyword: they run regardless of the JVM -ea flag
        // and report a message on failure.
        assertTrue("_source of the first hit should be a map", documentSource instanceof Map);
        @SuppressWarnings("unchecked")
        Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
        assertTrue("document _source should contain field [" + fieldName + "]", documentSourceMap.containsKey(fieldName));
        Object ingestOutputs = documentSourceMap.get(fieldName);
        assertEquals(expected, ingestOutputs);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"settings":{
"default_pipeline": "%s",
"number_of_shards": 3,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"body": {
"type": "text"
},
"body_chunk": {
"type": "text"
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"description": "An example fixed token length chunker pipeline with standard tokenizer",
"processors" : [
{
"text_chunking": {
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "standard"
}
}
}
}
]
}
30 changes: 29 additions & 1 deletion qa/rolling-upgrade/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because we introduce this feature in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
Expand Down Expand Up @@ -126,6 +133,13 @@ task testAgainstOneThirdUpgradedCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because we introduce this feature in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
Expand All @@ -150,7 +164,7 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
systemProperty 'tests.skip_delete_model_index', 'true'
systemProperty 'tests.plugin_bwc_version', ext.neural_search_bwc_version

//Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
// Excluding MultiModalSearchIT, HybridSearchIT, NeuralSparseSearchIT, NeuralQueryEnricherProcessorIT tests from neural search version 2.9 and 2.10
vibrantvarun marked this conversation as resolved.
Show resolved Hide resolved
// because these features were released in 2.11 version.
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10")){
filter {
Expand All @@ -168,6 +182,13 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because we introduce this feature in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
Expand Down Expand Up @@ -210,6 +231,13 @@ task testRollingUpgrade(type: StandaloneRestIntegTestTask) {
}
}

// Excluding the text chunking processor test because we introduce this feature in 2.13
if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
filter {
excludeTestsMatching "org.opensearch.neuralsearch.bwc.TextChunkingProcessorIT.*"
}
}

nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
systemProperty 'tests.security.manager', 'false'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.junit.Before;
import org.opensearch.common.settings.Settings;
Expand Down Expand Up @@ -130,4 +132,11 @@ protected void createPipelineForSparseEncodingProcessor(String modelId, String p
);
createPipelineProcessor(requestBody, pipelineName, modelId);
}

protected void createPipelineForTextChunkingProcessor(String pipelineName) throws Exception {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, maybe we can have a utility class for common methods like this

String requestBody = Files.readString(
Path.of(classLoader.getResource("processor/PipelineForTextChunkingProcessorConfiguration.json").toURI())
);
createPipelineProcessor(requestBody, pipelineName, "");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class NeuralSparseSearchIT extends AbstractRollingUpgradeTestCase {

// Test rolling-upgrade test sparse embedding processor
// Create Sparse Encoding Processor, Ingestion Pipeline and add document
// Validate process , pipeline and document count in restart-upgrade scenario
// Validate process , pipeline and document count in rolling-upgrade scenario
public void testSparseEncodingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
switch (getClusterType()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.bwc;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchAllQueryBuilder;
import static org.opensearch.neuralsearch.TestUtils.NODES_BWC_CLUSTER;

public class TextChunkingProcessorIT extends AbstractRollingUpgradeTestCase {

private static final String PIPELINE_NAME = "pipeline-text-chunking";
private static final String INPUT_FIELD = "body";
private static final String OUTPUT_FIELD = "body_chunk";
private static final String TEST_INDEX_SETTING_PATH = "processor/ChunkingIndexSettings.json";
private static final int NUM_DOCS_PER_ROUND = 1;
private static final String TEST_INGEST_TEXT =
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";

List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);

// Test rolling-upgrade text chunking processor
// Create Text Chunking Processor, Ingestion Pipeline and add document
// Validate process, pipeline and document count in rolling-upgrade scenario
public void testTextChunkingProcessor_E2EFlow() throws Exception {
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
String indexName = getIndexNameForTest();
switch (getClusterType()) {
case OLD:
createPipelineForTextChunkingProcessor(PIPELINE_NAME);
createChunkingIndex(indexName);
addDocument(indexName, "0", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
break;
case MIXED:
int totalDocsCountMixed;
if (isFirstMixedRound()) {
totalDocsCountMixed = NUM_DOCS_PER_ROUND;
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
yuye-aws marked this conversation as resolved.
Show resolved Hide resolved
addDocument(indexName, "1", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
} else {
totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND;
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountMixed, expectedPassages);
yuye-aws marked this conversation as resolved.
Show resolved Hide resolved
}
break;
case UPGRADED:
try {
int totalDocsCountUpgraded = 3 * NUM_DOCS_PER_ROUND;
addDocument(indexName, "2", INPUT_FIELD, TEST_INGEST_TEXT, null, null);
validateTestIndex(indexName, OUTPUT_FIELD, totalDocsCountUpgraded, expectedPassages);
} finally {
wipeOfTestResources(indexName, PIPELINE_NAME, null, null);
}
break;
default:
throw new IllegalStateException("Unexpected value: " + getClusterType());
}
}

private void createChunkingIndex(String indexName) throws Exception {
URL documentURLPath = classLoader.getResource(TEST_INDEX_SETTING_PATH);
Objects.requireNonNull(documentURLPath);
String indexSetting = Files.readString(Path.of(documentURLPath.toURI()));
createIndexWithConfiguration(indexName, indexSetting, PIPELINE_NAME);
}

private void validateTestIndex(String indexName, String fieldName, int documentCount, Object expected) {
int docCount = getDocCount(indexName);
assertEquals(documentCount, docCount);
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Map<String, Object> searchResults = search(indexName, query, 10);
assertNotNull(searchResults);
Map<String, Object> document = getFirstInnerHit(searchResults);
assertNotNull(document);
Object documentSource = document.get("_source");
assert (documentSource instanceof Map);
@SuppressWarnings("unchecked")
Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
assert (documentSourceMap).containsKey(fieldName);
Object ingestOutputs = documentSourceMap.get(fieldName);
assertEquals(expected, ingestOutputs);
}
}
Loading
Loading