-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add vector search with embedding generation workload (#232)
* Add vector search with embedding generation workload Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * Add vector search with embedding generation workload Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * Updated README.md with the license text. Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * - Changed the workload from vectorsearch_embedding to semantic_search. - Changed dataset from ms marco to trec-covid. - Moved benchmark task runners DeletePipeline, DeleteMlModel, RegisterMlModel and DeployMlModel to OS-benchmark repo. Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * - Changed the workload name semantic_search to treccovid_semantic_search. - Added the sample output for treccovid_semantic_search. - Added description of test procedure. - Simplified treccovid_semantic_search workload configuration. Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * Updated parameters of treccovid workload. Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * Added files.txt to treccovid workload. Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> * Updated the documents url for treccovid_semantic_search. Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> --------- Signed-off-by: Vesa Pehkonen <vesa.pehkonen@intel.com> (cherry picked from commit 417170f) Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
- Loading branch information
1 parent
5be2380
commit 250afc1
Showing
8 changed files
with
596 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
documents.json.bz2 | ||
queries.json.bz2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
{ | ||
"settings": { | ||
{%-if number_of_shards is defined %} | ||
"index.number_of_shards": {{number_of_shards}}, | ||
{%- endif %} | ||
{%-if number_of_replicas is defined %} | ||
"index.number_of_replicas": {{number_of_replicas}}, | ||
{%- endif %} | ||
"index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}}, | ||
"index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}}, | ||
"index.knn": true, | ||
"default_pipeline": "nlp-ingest-pipeline" | ||
}, | ||
"mappings": { | ||
"properties": { | ||
"id": { | ||
"type": "text" | ||
}, | ||
"passage_embedding": { | ||
"type": "knn_vector", | ||
"dimension": {{dimensions | default(768)}}, | ||
"method": { | ||
{%-if engine is defined %} | ||
"engine": "{{engine}}", | ||
{%- endif %} | ||
"space_type": "{{space_type | default('l2')}}", | ||
"name": "{{method | default('hnsw')}}", | ||
"parameters": {} | ||
} | ||
}, | ||
"text": { | ||
"type": "text" | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
{ | ||
"name": "delete-ingest-pipeline", | ||
"operation-type": "delete-pipeline", | ||
"id": "nlp-ingest-pipeline" | ||
}, | ||
{ | ||
"name": "create-ingest-pipeline", | ||
"operation-type": "put-pipeline", | ||
"param-source": "create-ingest-pipeline", | ||
"id": "nlp-ingest-pipeline", | ||
"body": { | ||
"description": "An NLP ingest pipeline", | ||
"processors": [ | ||
{ | ||
"text_embedding": { | ||
"model_id": "", | ||
"field_map": { | ||
"text": "passage_embedding" | ||
} | ||
} | ||
} | ||
] | ||
} | ||
}, | ||
{ | ||
"name": "index-append", | ||
"operation-type": "bulk", | ||
"bulk-size": {{bulk_size | default(100)}}, | ||
"ingest-percentage": {{ingest_percentage | default(100)}} | ||
}, | ||
{ | ||
"name": "wait-until-merges-finish", | ||
"operation-type": "index-stats", | ||
"index": "_all", | ||
"condition": { | ||
"path": "_all.total.merges.current", | ||
"expected-value": 0 | ||
}, | ||
"retry-until-success": true, | ||
"include-in-reporting": false | ||
}, | ||
{ | ||
"name": "default", | ||
"operation-type": "search", | ||
"body": { | ||
"query": { | ||
"match_all": {} | ||
} | ||
} | ||
}, | ||
{ | ||
"name": "semantic-search", | ||
"operation-type": "search", | ||
"num-variable-queries": {{num_variable_queries | default(0)}}, | ||
"param-source": "semantic-search-source", | ||
"body": { | ||
"_source": { | ||
"excludes": [ | ||
"passage_embedding" | ||
] | ||
}, | ||
"query": { | ||
"neural": { | ||
"passage_embedding": { | ||
"query_text": "what types of rapid testing for Covid-19 have been developed?", | ||
"model_id": "", | ||
"k": {{k | default(10)}} | ||
} | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
{ | ||
"name": "index-merge-search", | ||
"description": "Indexes the corpus with vector embedding and then runs queries with vector embedding.", | ||
"default": true, | ||
"schedule": [ | ||
{ | ||
"name": "cluster-settings", | ||
"operation": { | ||
"operation-type": "put-settings", | ||
"body": { | ||
"persistent": { | ||
"plugins": { | ||
"ml_commons": { | ||
"only_run_on_ml_node": "false", | ||
"native_memory_threshold": "99", | ||
"allow_registering_model_via_local_file": "true", | ||
"allow_registering_model_via_url": "true" | ||
} | ||
} | ||
} | ||
} | ||
} | ||
}, | ||
{ | ||
"operation": "delete-index" | ||
}, | ||
{ | ||
"operation": "delete-ingest-pipeline" | ||
}, | ||
{ | ||
"operation": { | ||
"operation-type": "delete-ml-model", | ||
"model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" | ||
} | ||
}, | ||
{ | ||
"operation": { | ||
"operation-type": "register-ml-model", | ||
"model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", | ||
"model-version": "{{ model_version | default('1.0.1') }}", | ||
"model-format": "{{ model_format | default('TORCH_SCRIPT') }}", | ||
"model-config-file": "{{ model_config_file | default('') }}" | ||
} | ||
}, | ||
{ | ||
"operation": "deploy-ml-model" | ||
}, | ||
{ | ||
"operation": "create-ingest-pipeline" | ||
}, | ||
{ | ||
"operation": { | ||
"operation-type": "create-index", | ||
"settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { | ||
"index.refresh_interval": "5s", | ||
"index.translog.flush_threshold_size": "1g" | ||
}{%- endif %} | ||
} | ||
}, | ||
{ | ||
"name": "check-cluster-health", | ||
"operation": { | ||
"operation-type": "cluster-health", | ||
"index": "treccovid", | ||
"request-params": { | ||
"wait_for_status": "{{cluster_health | default('green')}}", | ||
"wait_for_no_relocating_shards": "true" | ||
}, | ||
"retry-until-success": true | ||
} | ||
}, | ||
{ | ||
"operation": "index-append", | ||
"warmup-time-period": 60, | ||
"clients": {{bulk_indexing_clients | default(1)}}, | ||
"ignore-response-error-level": "{{error_level | default('non-fatal')}}" | ||
}, | ||
{ | ||
"name": "refresh-after-index", | ||
"operation": "refresh" | ||
}, | ||
{ | ||
"operation": { | ||
"operation-type": "force-merge", | ||
"request-timeout": 7200{%- if force_merge_max_num_segments is defined %}, | ||
"max-num-segments": {{ force_merge_max_num_segments | tojson }} | ||
{%- endif %} | ||
} | ||
}, | ||
{ | ||
"name": "refresh-after-force-merge", | ||
"operation": "refresh" | ||
}, | ||
{ | ||
"operation": "wait-until-merges-finish" | ||
}, | ||
{ | ||
"operation": "default", | ||
"warmup-iterations": {{warmup_iterations | default(500) | tojson}}, | ||
"iterations": {{iterations | default(500) | tojson }}, | ||
"target-throughput": {{ target_throughput | default(100) | tojson}}, | ||
"clients": {{ search_clients | default(1) }} | ||
}, | ||
{ | ||
"operation": "semantic-search", | ||
"warmup-iterations": {{warmup_iterations | default(100) | tojson}}, | ||
"iterations": {{iterations | default(100) | tojson }}, | ||
"target-throughput": {{ target_throughput | default(10) | tojson}}, | ||
"clients": {{ search_clients | default(1)}} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
{% import "benchmark.helpers" as benchmark with context %} | ||
|
||
{ | ||
"version": 2, | ||
"description": "Trec-Covid is a dataset collection of documents about COVID-19 information.", | ||
"indices": [ | ||
{ | ||
"name": "treccovid", | ||
"body": "index.json" | ||
} | ||
], | ||
"corpora": [ | ||
{ | ||
"name": "treccovid", | ||
"base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid", | ||
"documents": [ | ||
{ | ||
"source-file": "documents.json.bz2", | ||
"document-count": 129192, | ||
"compressed-bytes": 51187469, | ||
"uncompressed-bytes": 211980208 | ||
} | ||
] | ||
} | ||
], | ||
"operations": [ | ||
{{ benchmark.collect(parts="operations/*.json") }} | ||
], | ||
"test_procedures": [ | ||
{{ benchmark.collect(parts="test_procedures/*.json") }} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import random | ||
import os | ||
import json | ||
from pathlib import Path | ||
|
||
from osbenchmark.workload.loader import Downloader | ||
from osbenchmark.workload.loader import Decompressor | ||
from osbenchmark.workload.loader import Decompressor | ||
|
||
# Absolute directory containing this module (symlinks resolved), so that files
# shipped next to it can be located regardless of the current working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
|
||
def ingest_pipeline_param_source(workload, params, **kwargs):
    """Fill in the model id of the pipeline's first ``text_embedding`` processor.

    If ``params['body']['processors'][0]['text_embedding']['model_id']`` is
    empty, the id is read from the ``model_id`` key of ``model_id.json`` in the
    current working directory. A non-empty id is left untouched.

    :param workload: The workload object; not used here.
    :param params: Operation parameters; modified in place.
    :param kwargs: Ignored.
    :return: The same ``params`` dict that was passed in.
    :raises OSError: if the id is empty and ``model_id.json`` cannot be opened.
    :raises KeyError: if the expected keys are missing from ``params`` or the file.
    """
    embedding = params['body']['processors'][0]['text_embedding']
    if not embedding['model_id']:
        # NOTE(review): presumably this file is produced by the step that
        # registers the ML model - confirm against the benchmark runner.
        with open('model_id.json', encoding='utf-8') as f:
            embedding['model_id'] = json.load(f)['model_id']
    return params
|
||
class QueryParamSource:
    """Parameter source that supplies the body of a neural search request.

    On every call to :meth:`params` the ``model_id`` of the
    ``neural.passage_embedding`` query is set from ``model_id.json`` (looked up
    in the current working directory). When variable queries are enabled, the
    ``query_text`` is additionally replaced by the ``text`` field of a randomly
    chosen line of the uncompressed query file stored next to this module.

    The number of variable queries is taken from ``params['variable-queries']``
    or, if that is absent or zero, from ``params['num-variable-queries']``;
    any value greater than zero enables random query selection.
    """

    def __init__(self, workload, params, **kwargs):
        """Resolve the target index/type and make sure the query file exists.

        :param workload: Object exposing ``indices``; each index must provide
            ``name`` and ``types`` (whose items provide ``name``).
        :param params: Operation parameters; stored and modified in place.
        :param kwargs: Ignored.
        """
        if len(workload.indices) == 1:
            only_index = workload.indices[0]
            index = only_index.name
            # Named type_name to avoid shadowing the ``type`` builtin.
            type_name = only_index.types[0].name if len(only_index.types) == 1 else None
        else:
            index = "_all"
            type_name = None

        # The operation definition spells the key "num-variable-queries" while
        # this class historically read "variable-queries"; honour both.
        variable_queries = (
            params.get("variable-queries")
            or params.get("num-variable-queries")
            or 0
        )

        self._params = params
        self._params['index'] = index
        self._params['type'] = type_name
        self._params['variable-queries'] = variable_queries
        self.infinite = True

        # Lazily filled caches so params() does not hit the disk per request.
        self._model_id = None
        self._queries = None
        self._queries_path = os.path.join(self._workload_dir(), 'queries.json')

        if variable_queries > 0:
            self._ensure_queries_file()

    @staticmethod
    def _workload_dir():
        """Return the absolute directory that contains this module."""
        return os.path.dirname(os.path.realpath(__file__))

    def _ensure_queries_file(self):
        """Download and decompress the query file unless it is already present."""
        base_dir = self._workload_dir()
        with open(os.path.join(base_dir, 'workload_queries.json'), 'r', encoding='utf-8') as f:
            spec = json.load(f)
        source_file = spec['source-file']
        compressed_path = os.path.join(base_dir, source_file)
        # The uncompressed file is the source file without its last suffix.
        uncompressed_path = os.path.join(base_dir, Path(source_file).stem)
        if not os.path.exists(compressed_path):
            downloader = Downloader(False, False)
            downloader.download(spec['base-url'], None, compressed_path, spec['compressed-bytes'])
        if not os.path.exists(uncompressed_path):
            decompressor = Decompressor()
            decompressor.decompress(compressed_path, uncompressed_path, spec['uncompressed-bytes'])
        self._queries_path = uncompressed_path

    def partition(self, partition_index, total_partitions):
        """Return this instance; every partition shares the same source."""
        return self

    def params(self):
        """Return the request parameters with model id (and query text) filled in.

        :return: The stored parameter dict, modified in place.
        :raises OSError: if ``model_id.json`` or the query file cannot be opened.
        :raises IndexError: if variable queries are enabled but the query file
            contains no non-blank line.
        """
        params = self._params
        if self._model_id is None:
            with open('model_id.json', 'r', encoding='utf-8') as f:
                self._model_id = json.load(f)['model_id']
        neural_query = params['body']['query']['neural']['passage_embedding']
        neural_query['model_id'] = self._model_id

        if params.get("variable-queries", 0) > 0:
            if self._queries is None:
                with open(self._queries_path, 'r', encoding='utf-8') as f:
                    # Skip blank lines so a trailing newline cannot yield
                    # an unparsable choice.
                    self._queries = [line for line in f.read().splitlines() if line.strip()]
            chosen = random.choice(self._queries)
            neural_query['query_text'] = json.loads(chosen)['text']
        return params
|
||
def register(registry):
    """Register this workload's custom parameter sources.

    The names registered here are the ones that operations reference through
    their ``param-source`` setting.

    :param registry: Object providing ``register_param_source(name, source)``.
    """
    registry.register_param_source("semantic-search-source", QueryParamSource)
    registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid", | ||
"source-file": "queries.json.bz2", | ||
"compressed-bytes": 4310, | ||
"uncompressed-bytes": 16552 | ||
} |