Make doc and query count configurable in benchmark (#270)
Makes the document and query count configurable in the benchmarking
tool. With this functionality, users can index or search only a subset of
the vectors in the data set. This is useful for indices that require
training, where only a subset of the data set may be needed for the
training step.
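In effect, each step reads the optional count from its configuration, falls back to the full data-set size when the parameter is absent, and caps the requested value at the data-set size. A minimal sketch of that behavior, with an illustrative helper standing in for the tool's own parameter parsing:

```python
# Sketch of the capping behavior this change introduces (names are illustrative).
def read_int_param(key, config, default):
    """Return config[key] as an int, falling back to the default."""
    return int(config.get(key, default))

def effective_count(key, step_config, dataset_size):
    """Requested count, defaulting to and never exceeding the data-set size."""
    requested = read_int_param(key, step_config, dataset_size)
    return min(requested, dataset_size)

# A data set of 1,000,000 vectors: index 100,000 docs, run 1,000 queries.
step_config = {'doc_count': 100_000, 'query_count': 1_000}
print(effective_count('doc_count', step_config, 1_000_000))    # 100000
print(effective_count('query_count', step_config, 1_000_000))  # 1000
print(effective_count('doc_count', {}, 1_000_000))             # 1000000 (full data set)
```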

Signed-off-by: John Mazanec <jmazane@amazon.com>
jmazanec15 authored Jan 27, 2022
1 parent 6e859f5 commit 76ec5cd
Showing 2 changed files with 22 additions and 14 deletions.
6 changes: 4 additions & 2 deletions benchmarks/perf-tool/README.md
@@ -219,8 +219,9 @@ Ingests a dataset of vectors into the cluster.
| index_name | Name of index to ingest into | No default |
| field_name | Name of field to ingest into | No default |
| bulk_size | Documents per bulk request | 300 |
-| dataset_format | Format the dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' |
-| dataset_path | Path to dataset | No default |
+| dataset_format | Format the data-set is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' |
+| dataset_path | Path to data-set | No default |
+| doc_count | Number of documents to create from data-set | Size of the data-set |
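Taken together with bulk_size, doc_count bounds how many bulk requests the ingest step issues: the loop shown further down in steps.py steps over range(0, doc_count, bulk_size). A small illustrative calculation:

```python
# Illustrative: upper bound on bulk requests for a given doc_count and bulk_size;
# the ingest loop iterates range(0, doc_count, bulk_size) and may stop earlier
# if the data set is exhausted.
import math

def max_bulk_requests(doc_count: int, bulk_size: int = 300) -> int:
    return math.ceil(doc_count / bulk_size)

print(max_bulk_requests(100_000, 300))  # 334
print(max_bulk_requests(1_000, 300))    # 4
```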

##### Metrics

@@ -245,6 +246,7 @@ Runs a set of queries against an index.
| dataset_path | Path to dataset | No default |
| neighbors_format | Format the neighbors dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' |
| neighbors_path | Path to neighbors dataset | No default |
+| query_count | Number of queries to create from data-set | Size of the data-set |
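As a hypothetical illustration (parameter names come from the tables above; the values, and any parameters omitted here, are made up), the configuration each step receives might look like:

```python
# Hypothetical step configurations, written as the Python dicts handed to each step.
ingest_step_config = {
    'index_name': 'target_index',
    'field_name': 'target_field',
    'bulk_size': 300,
    'dataset_format': 'hdf5',
    'dataset_path': '/tmp/dataset.hdf5',
    'doc_count': 100_000,     # index only the first 100k vectors
}

query_step_config = {
    'dataset_path': '/tmp/dataset.hdf5',
    'neighbors_format': 'hdf5',
    'neighbors_path': '/tmp/neighbors.hdf5',
    'query_count': 1_000,     # run only the first 1,000 queries
}
```

Because the requested counts are capped at the data-set size, values larger than the data set simply fall back to indexing or querying the whole set.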

##### Metrics

30 changes: 18 additions & 12 deletions benchmarks/perf-tool/okpt/test/steps/steps.py
@@ -278,21 +278,23 @@ def __init__(self, step_config: StepConfig):
self.dataset = parse_dataset(dataset_format, dataset_path,
Context.INDEX)

+input_doc_count = parse_int_param('doc_count', step_config.config, {},
+                                   self.dataset.size())
+self.doc_count = min(input_doc_count, self.dataset.size())

def _action(self):

def action(doc_id):
return {'index': {'_index': self.index_name, '_id': doc_id}}

-id = 0
index_responses = []
-while True:
+for i in range(0, self.doc_count, self.bulk_size):
partition = self.dataset.read(self.bulk_size)
if partition is None:
break
-body = bulk_transform(partition, self.field_name, action, id)
+body = bulk_transform(partition, self.field_name, action, i)
result = bulk_index(self.opensearch, self.index_name, body)
index_responses.append(result)
-id += self.bulk_size

self.dataset.reset()

Expand Down Expand Up @@ -324,6 +326,11 @@ def __init__(self, step_config: StepConfig):
self.dataset = parse_dataset(dataset_format, dataset_path,
Context.QUERY)

+input_query_count = parse_int_param('query_count',
+                                     step_config.config, {},
+                                     self.dataset.size())
+self.query_count = min(input_query_count, self.dataset.size())

neighbors_format = parse_string_param('neighbors_format',
step_config.config, {}, 'hdf5')
neighbors_path = parse_string_param('neighbors_path',
@@ -349,7 +356,7 @@ def get_body(vec):

results = {}
query_responses = []
-while True:
+for _ in range(self.query_count):
query = self.dataset.read(1)
if query is None:
break
@@ -367,10 +374,10 @@ def get_body(vec):
for hit in query_response['hits']['hits']]
for query_response in query_responses]
results['recall@K'] = recall_at_r(ids, self.neighbors,
-self.k, self.k)
+self.k, self.k, self.query_count)
self.neighbors.reset()
results[f'recall@{str(self.r)}'] = recall_at_r(
-ids, self.neighbors, self.r, self.k)
+ids, self.neighbors, self.r, self.k, self.query_count)
self.neighbors.reset()

self.dataset.reset()
@@ -473,7 +480,7 @@ def get_opensearch_client(endpoint: str, port: int):
)


-def recall_at_r(results, neighbor_dataset, r, k):
+def recall_at_r(results, neighbor_dataset, r, k, query_count):
"""
Calculates the recall@R for a set of queries against a ground truth nearest
neighbor set
@@ -486,22 +493,21 @@ def recall_at_r(results, neighbor_dataset, r, k):
r: number of top results to check if they are in the ground truth k-NN
set.
k: k value for the query
+query_count: number of queries
Returns:
Recall at R
"""
correct = 0.0
-query = 0
-while True:
+for query in range(query_count):
true_neighbors = neighbor_dataset.read(1)
if true_neighbors is None:
break
true_neighbors_set = set(true_neighbors[0][:k])
for j in range(r):
if results[query][j] in true_neighbors_set:
correct += 1.0
-query += 1

-return correct / (r * neighbor_dataset.size())
+return correct / (r * query_count)
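For intuition, a toy recomputation of recall@R with the new query_count denominator; the numbers are made up, and this mirrors rather than imports the function above:

```python
# Toy check of recall@R normalized by query_count (values illustrative).
# Two queries, k = 3 ground-truth neighbors each, r = 2 results checked per query.
ground_truth = [[1, 2, 3], [4, 5, 6]]   # top-k true neighbor ids per query
results = [[2, 9], [4, 6]]              # top-r returned ids per query

r, k, query_count = 2, 3, len(results)
correct = sum(
    1.0
    for query in range(query_count)
    for doc_id in results[query][:r]
    if doc_id in set(ground_truth[query][:k])
)
print(correct / (r * query_count))  # 3 hits out of 4 checked -> 0.75
```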


def get_index_size_in_kb(opensearch, index_name):
