Make doc and query count configurable in benchmark (#270)
Makes the document and query count configurable in the benchmarking
tool. With this functionality, users can index or search only a subset of
the vectors in the data set. This is useful for indices that require
training, where only a subset of the data set may be needed for the
training step.
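In effect, each step reads the optional count from its configuration, falls back to the full data-set size when the parameter is absent, and caps the requested value at the data-set size. A minimal sketch of that behavior, with an illustrative helper standing in for the tool's own parameter parsing:

```python
# Sketch of the capping behavior this change introduces (names are illustrative).
def read_int_param(key, config, default):
    """Return config[key] as an int, falling back to the default."""
    return int(config.get(key, default))

def effective_count(key, step_config, dataset_size):
    """Requested count, defaulting to and never exceeding the data-set size."""
    requested = read_int_param(key, step_config, dataset_size)
    return min(requested, dataset_size)

# A data set of 1,000,000 vectors: index 100,000 docs, run 1,000 queries.
step_config = {'doc_count': 100_000, 'query_count': 1_000}
print(effective_count('doc_count', step_config, 1_000_000))    # 100000
print(effective_count('query_count', step_config, 1_000_000))  # 1000
print(effective_count('doc_count', {}, 1_000_000))             # 1000000 (full data set)
```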

Signed-off-by: John Mazanec <jmazane@amazon.com>
jmazanec15 authored Jan 27, 2022
1 parent 6e859f5 commit 76ec5cd
Showing 2 changed files with 22 additions and 14 deletions.
6 changes: 4 additions & 2 deletions benchmarks/perf-tool/README.md
@@ -219,8 +219,9 @@ Ingests a dataset of vectors into the cluster.
| index_name | Name of index to ingest into | No default |
| field_name | Name of field to ingest into | No default |
| bulk_size | Documents per bulk request | 300 |
-| dataset_format | Format the dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' |
-| dataset_path | Path to dataset | No default |
+| dataset_format | Format the data-set is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' |
+| dataset_path | Path to data-set | No default |
+| doc_count | Number of documents to create from data-set | Size of the data-set |
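Taken together with bulk_size, doc_count bounds how many bulk requests the ingest step issues: the loop shown further down in steps.py steps over range(0, doc_count, bulk_size). A small illustrative calculation:

```python
# Illustrative: upper bound on bulk requests for a given doc_count and bulk_size;
# the ingest loop iterates range(0, doc_count, bulk_size) and may stop earlier
# if the data set is exhausted.
import math

def max_bulk_requests(doc_count: int, bulk_size: int = 300) -> int:
    return math.ceil(doc_count / bulk_size)

print(max_bulk_requests(100_000, 300))  # 334
print(max_bulk_requests(1_000, 300))    # 4
```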

##### Metrics

@@ -245,6 +246,7 @@ Runs a set of queries against an index.
| dataset_path | Path to dataset | No default |
| neighbors_format | Format the neighbors dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' |
| neighbors_path | Path to neighbors dataset | No default |
+| query_count | Number of queries to create from data-set | Size of the data-set |
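As a hypothetical illustration (parameter names come from the tables above; the values, and any parameters omitted here, are made up), the configuration each step receives might look like:

```python
# Hypothetical step configurations, written as the Python dicts handed to each step.
ingest_step_config = {
    'index_name': 'target_index',
    'field_name': 'target_field',
    'bulk_size': 300,
    'dataset_format': 'hdf5',
    'dataset_path': '/tmp/dataset.hdf5',
    'doc_count': 100_000,     # index only the first 100k vectors
}

query_step_config = {
    'dataset_path': '/tmp/dataset.hdf5',
    'neighbors_format': 'hdf5',
    'neighbors_path': '/tmp/neighbors.hdf5',
    'query_count': 1_000,     # run only the first 1,000 queries
}
```

Because the requested counts are capped at the data-set size, values larger than the data set simply fall back to indexing or querying the whole set.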

##### Metrics

30 changes: 18 additions & 12 deletions benchmarks/perf-tool/okpt/test/steps/steps.py
@@ -278,21 +278,23 @@ def __init__(self, step_config: StepConfig):
self.dataset = parse_dataset(dataset_format, dataset_path,
Context.INDEX)

+input_doc_count = parse_int_param('doc_count', step_config.config, {},
+                                   self.dataset.size())
+self.doc_count = min(input_doc_count, self.dataset.size())

def _action(self):

def action(doc_id):
return {'index': {'_index': self.index_name, '_id': doc_id}}

-id = 0
index_responses = []
-while True:
+for i in range(0, self.doc_count, self.bulk_size):
partition = self.dataset.read(self.bulk_size)
if partition is None:
break
-body = bulk_transform(partition, self.field_name, action, id)
+body = bulk_transform(partition, self.field_name, action, i)
result = bulk_index(self.opensearch, self.index_name, body)
index_responses.append(result)
-id += self.bulk_size

self.dataset.reset()

Expand Down Expand Up @@ -324,6 +326,11 @@ def __init__(self, step_config: StepConfig):
self.dataset = parse_dataset(dataset_format, dataset_path,
Context.QUERY)

+input_query_count = parse_int_param('query_count',
+                                     step_config.config, {},
+                                     self.dataset.size())
+self.query_count = min(input_query_count, self.dataset.size())

neighbors_format = parse_string_param('neighbors_format',
step_config.config, {}, 'hdf5')
neighbors_path = parse_string_param('neighbors_path',
@@ -349,7 +356,7 @@ def get_body(vec):

results = {}
query_responses = []
-while True:
+for _ in range(self.query_count):
query = self.dataset.read(1)
if query is None:
break
@@ -367,10 +374,10 @@ def get_body(vec):
for hit in query_response['hits']['hits']]
for query_response in query_responses]
results['recall@K'] = recall_at_r(ids, self.neighbors,
-self.k, self.k)
+self.k, self.k, self.query_count)
self.neighbors.reset()
results[f'recall@{str(self.r)}'] = recall_at_r(
-ids, self.neighbors, self.r, self.k)
+ids, self.neighbors, self.r, self.k, self.query_count)
self.neighbors.reset()

self.dataset.reset()
@@ -473,7 +480,7 @@ def get_opensearch_client(endpoint: str, port: int):
)


-def recall_at_r(results, neighbor_dataset, r, k):
+def recall_at_r(results, neighbor_dataset, r, k, query_count):
"""
Calculates the recall@R for a set of queries against a ground truth nearest
neighbor set
@@ -486,22 +493,21 @@ def recall_at_r(results, neighbor_dataset, r, k):
r: number of top results to check if they are in the ground truth k-NN
set.
k: k value for the query
+query_count: number of queries
Returns:
Recall at R
"""
correct = 0.0
-query = 0
-while True:
+for query in range(query_count):
true_neighbors = neighbor_dataset.read(1)
if true_neighbors is None:
break
true_neighbors_set = set(true_neighbors[0][:k])
for j in range(r):
if results[query][j] in true_neighbors_set:
correct += 1.0
-query += 1

-return correct / (r * neighbor_dataset.size())
+return correct / (r * query_count)
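For intuition, a toy recomputation of recall@R with the new query_count denominator; the numbers are made up, and this mirrors rather than imports the function above:

```python
# Toy check of recall@R normalized by query_count (values illustrative).
# Two queries, k = 3 ground-truth neighbors each, r = 2 results checked per query.
ground_truth = [[1, 2, 3], [4, 5, 6]]   # top-k true neighbor ids per query
results = [[2, 9], [4, 6]]              # top-r returned ids per query

r, k, query_count = 2, 3, len(results)
correct = sum(
    1.0
    for query in range(query_count)
    for doc_id in results[query][:r]
    if doc_id in set(ground_truth[query][:k])
)
print(correct / (r * query_count))  # 3 hits out of 4 checked -> 0.75
```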


def get_index_size_in_kb(opensearch, index_name):
