Make doc and query count configurable in benchmark #270

Merged: 3 commits, Jan 27, 2022
6 changes: 4 additions & 2 deletions benchmarks/perf-tool/README.md
@@ -219,8 +219,9 @@ Ingests a dataset of vectors into the cluster.
| index_name | Name of index to ingest into | No default |
| field_name | Name of field to ingest into | No default |
| bulk_size | Documents per bulk request | 300 |
-| dataset_format | Format the dataset is in. Currently hdf5 and bigann are supported. The hdf5 file must be organized in the same way that ann-benchmarks organizes theirs. | 'hdf5' |
-| dataset_path | Path to dataset | No default |
+| dataset_format | Format the data-set is in. Currently hdf5 and bigann are supported. The hdf5 file must be organized in the same way that ann-benchmarks organizes theirs. | 'hdf5' |
+| dataset_path | Path to data-set | No default |
+| doc_count | Number of documents to create from data-set | Size of the data-set |

##### Metrics

@@ -245,6 +246,7 @@ Runs a set of queries against an index.
| dataset_path | Path to dataset | No default |
| neighbors_format | Format the neighbors dataset is in. Currently hdf5 and bigann are supported. The hdf5 file must be organized in the same way that ann-benchmarks organizes theirs. | 'hdf5' |
| neighbors_path | Path to neighbors dataset | No default |
+| query_count | Number of queries to create from data-set | Size of the data-set |

##### Metrics

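To make the new defaults concrete: both parameters default to the data-set size and, per the clamping added in steps.py below, are capped at it, so an oversized value simply reads the whole data-set. A toy illustration in plain Python (all values made up):

```python
dataset_size = 10_000   # e.g., number of vectors in the hdf5 file
requested = 50_000      # hypothetical doc_count from a test config
doc_count = min(requested, dataset_size)
print(doc_count)        # 10000: the tool never reads past the end of the data-set
```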
30 changes: 18 additions & 12 deletions benchmarks/perf-tool/okpt/test/steps/steps.py
@@ -278,21 +278,23 @@ def __init__(self, step_config: StepConfig):
        self.dataset = parse_dataset(dataset_format, dataset_path,
                                     Context.INDEX)

+        input_doc_count = parse_int_param('doc_count', step_config.config, {},
+                                          self.dataset.size())
+        self.doc_count = min(input_doc_count, self.dataset.size())
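`parse_int_param` itself is outside this diff; below is a minimal sketch of the contract the calls above appear to rely on, with the (key, primary config, fallback config, default) signature inferred from the call sites rather than copied from the perf-tool source:

```python
def parse_int_param(key, first_map, second_map, default):
    """Assumed behavior: look key up in two config maps, else use default."""
    for source in (first_map, second_map):
        if key in source:
            value = source[key]
            if not isinstance(value, int):
                raise ValueError(f"'{key}' must be an int, got {type(value).__name__}")
            return value
    return default
```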

    def _action(self):

        def action(doc_id):
            return {'index': {'_index': self.index_name, '_id': doc_id}}

-        id = 0
        index_responses = []
-        while True:
+        for i in range(0, self.doc_count, self.bulk_size):
            partition = self.dataset.read(self.bulk_size)
            if partition is None:
                break
-            body = bulk_transform(partition, self.field_name, action, id)
+            body = bulk_transform(partition, self.field_name, action, i)
            result = bulk_index(self.opensearch, self.index_name, body)
            index_responses.append(result)
-            id += self.bulk_size

        self.dataset.reset()
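With the manual counter gone, the loop variable `i` serves as both the offset of each bulk batch and the `_id` of its first document. A toy illustration with made-up numbers:

```python
doc_count, bulk_size = 1000, 300             # hypothetical config values
print(list(range(0, doc_count, bulk_size)))  # [0, 300, 600, 900]
# Each value is the starting doc id of one bulk request; the final batch may
# be short, and a None from dataset.read() still ends the loop early.
```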

@@ -324,6 +326,11 @@ def __init__(self, step_config: StepConfig):
        self.dataset = parse_dataset(dataset_format, dataset_path,
                                     Context.QUERY)

+        input_query_count = parse_int_param('query_count',
+                                            step_config.config, {},
+                                            self.dataset.size())
+        self.query_count = min(input_query_count, self.dataset.size())

        neighbors_format = parse_string_param('neighbors_format',
                                              step_config.config, {}, 'hdf5')
        neighbors_path = parse_string_param('neighbors_path',
@@ -349,7 +356,7 @@ def get_body(vec):

        results = {}
        query_responses = []
-        while True:
+        for _ in range(self.query_count):
            query = self.dataset.read(1)
            if query is None:
                break
@@ -367,10 +374,10 @@ def get_body(vec):
                for hit in query_response['hits']['hits']]
               for query_response in query_responses]
        results['recall@K'] = recall_at_r(ids, self.neighbors,
-                                          self.k, self.k)
+                                          self.k, self.k, self.query_count)
        self.neighbors.reset()
        results[f'recall@{str(self.r)}'] = recall_at_r(
-            ids, self.neighbors, self.r, self.k)
+            ids, self.neighbors, self.r, self.k, self.query_count)
        self.neighbors.reset()

        self.dataset.reset()
@@ -473,7 +480,7 @@ def get_opensearch_client(endpoint: str, port: int):
    )


-def recall_at_r(results, neighbor_dataset, r, k):
+def recall_at_r(results, neighbor_dataset, r, k, query_count):
"""
Calculates the recall@R for a set of queries against a ground truth nearest
neighbor set
Expand All @@ -486,22 +493,21 @@ def recall_at_r(results, neighbor_dataset, r, k):
r: number of top results to check if they are in the ground truth k-NN
set.
k: k value for the query
query_count: number of queries
Returns:
Recall at R
"""
    correct = 0.0
-    query = 0
-    while True:
+    for query in range(query_count):
        true_neighbors = neighbor_dataset.read(1)
        if true_neighbors is None:
            break
        true_neighbors_set = set(true_neighbors[0][:k])
        for j in range(r):
Review comment (Member): FAR: maybe change r to limit or something meaningful.

Reply (Member, author): r is a technical term in recall@r. I calculate it now as the fraction (number of the top r results returned by the query that are in the ground truth k set) / r.

However, I think I may have this mixed up a little bit and I will need to refactor this to follow how faiss computes it: https://github.com/facebookresearch/faiss/blob/main/faiss/AutoTune.cpp#L60-L97. I will make a separate issue for this.

            if results[query][j] in true_neighbors_set:
                correct += 1.0
-        query += 1

-    return correct / (r * neighbor_dataset.size())
+    return correct / (r * query_count)
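As a hedged sanity check of the new signature, here is a toy run of `recall_at_r` against an in-memory stand-in for the neighbor data-set (the real reader is hdf5/bigann-backed; this `_ListDataset` stub only mimics its read()/reset() contract):

```python
class _ListDataset:
    """Illustrative stub: returns rows in chunks, None when exhausted."""
    def __init__(self, rows):
        self._rows, self._pos = rows, 0

    def read(self, n):
        if self._pos >= len(self._rows):
            return None
        chunk = self._rows[self._pos:self._pos + n]
        self._pos += n
        return chunk

    def reset(self):
        self._pos = 0

results = [[1, 2, 9], [4, 8, 6]]              # top-k ids returned per query
truth = _ListDataset([[1, 2, 3], [4, 5, 6]])  # ground-truth ids per query
# Query 0 matches 2 of its top r=3 hits, query 1 also matches 2 of 3,
# so recall = (2 + 2) / (3 * 2) ≈ 0.667.
print(recall_at_r(results, truth, r=3, k=3, query_count=2))
```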


def get_index_size_in_kb(opensearch, index_name):