-
Notifications
You must be signed in to change notification settings - Fork 289
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: override existing results #1617
Changes from 7 commits
e51a04c
e5c19de
30e7443
9634f5e
15aa6a5
182338b
e1e058a
1ec1d1c
3941a04
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,7 +63,6 @@ def test_one_missing_split(model, tasks, tmp_path): | |
eval_splits=["val", "test"], | ||
output_folder=str(tmp_path / "testcase2"), | ||
verbosity=2, | ||
overwrite_results=True, | ||
) | ||
|
||
assert "MockRetrievalTask" == results2[0].task_name | ||
|
@@ -93,12 +92,10 @@ def test_no_missing_splits(model, tasks, tmp_path): | |
eval_splits=["val", "test"], | ||
output_folder=str(tmp_path / "testcase3"), | ||
verbosity=2, | ||
overwrite_results=True, | ||
) | ||
|
||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert "MockRetrievalTask" in last_evaluated_splits | ||
assert len(last_evaluated_splits["MockRetrievalTask"]) == 0 | ||
assert len(last_evaluated_splits) == 0 | ||
assert results[0].scores.keys() == {"test", "val"} | ||
|
||
|
||
|
@@ -144,7 +141,6 @@ def test_missing_language(model, multilingual_tasks, tmp_path): | |
output_folder=str(tmp_path / "missing_lang_test"), | ||
verbosity=2, | ||
eval_subsets=["eng", "fra"], | ||
overwrite_results=True, | ||
) | ||
|
||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
|
@@ -178,11 +174,9 @@ def test_no_missing_languages(model, multilingual_tasks, tmp_path): | |
output_folder=str(tmp_path / "no_missing_lang_test"), | ||
verbosity=2, | ||
eval_subsets=["eng", "fra"], | ||
overwrite_results=True, | ||
) | ||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert "MockMultilingualRetrievalTask" in last_evaluated_splits | ||
assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 0 | ||
assert len(last_evaluated_splits) == 0 | ||
assert results[0].scores.keys() == {"test"} | ||
assert len(results[0].scores["test"]) == 2 | ||
assert sorted(results[0].languages) == ["eng", "fra"] | ||
|
@@ -210,7 +204,6 @@ def test_partial_languages(model, multilingual_tasks, tmp_path): | |
output_folder=str(tmp_path / "partial_lang_test"), | ||
verbosity=2, | ||
eval_subsets=["fra", "eng"], | ||
overwrite_results=True, | ||
) | ||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 | ||
|
@@ -245,7 +238,6 @@ def test_multilingual_one_missing_split_no_missing_lang( | |
output_folder=str(tmp_path / "partial_langs_partial_splits"), | ||
verbosity=2, | ||
eval_subsets=["eng", "fra"], | ||
overwrite_results=True, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should not affect how missing splits or langs are selected. |
||
) | ||
|
||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
|
@@ -280,7 +272,6 @@ def test_multilingual_one_missing_lang_in_one_split( | |
output_folder=str(tmp_path / "one_lang_one_split"), | ||
verbosity=2, | ||
eval_subsets=["eng"], | ||
overwrite_results=True, | ||
) | ||
|
||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
|
@@ -296,7 +287,6 @@ def test_multilingual_one_missing_lang_in_one_split( | |
output_folder=str(tmp_path / "one_lang_one_split"), | ||
verbosity=2, | ||
eval_subsets=["eng", "fra"], | ||
overwrite_results=True, | ||
) | ||
|
||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
|
@@ -305,3 +295,70 @@ def test_multilingual_one_missing_lang_in_one_split( | |
# output merged result with previous results | ||
assert results[0].scores.keys() == {"test", "val"} | ||
assert len(results[0].scores["test"]) == 2 | ||
|
||
|
||
def test_all_splits_evaluated_with_overwrite(model, tasks, tmp_path): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What does "all splits" mean here? There are test and val, but only val is run here.
Based on this comment, is it correct that "all" means all specified splits in the run method, and not all splits available in the task metadata's eval_splits? Perhaps this can be spelled out, or preferably shown in additional test cases, where the 1st and 2nd runs have different `eval_splits`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yes, all splits specified in `run`. |
||
evaluation = MTEB(tasks=tasks) | ||
results = evaluation.run( | ||
model, | ||
eval_splits=["val"], | ||
output_folder=str(tmp_path / "all_splits_evaluated_with_overwrite"), | ||
verbosity=2, | ||
) | ||
|
||
assert "MockRetrievalTask" == results[0].task_name | ||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert set(last_evaluated_splits["MockRetrievalTask"]) == {"val"} | ||
assert len(last_evaluated_splits["MockRetrievalTask"]) == 1 | ||
assert results[0].scores.keys() == {"val"} | ||
|
||
results2 = evaluation.run( | ||
model, | ||
eval_splits=["val"], | ||
output_folder=str(tmp_path / "all_splits_evaluated_with_overwrite"), | ||
verbosity=2, | ||
overwrite_results=True, | ||
) | ||
assert "MockRetrievalTask" == results2[0].task_name | ||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert set(last_evaluated_splits["MockRetrievalTask"]) == {"val"} | ||
assert len(last_evaluated_splits["MockRetrievalTask"]) == 1 | ||
Samoed marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert results2[0].scores.keys() == {"val"} | ||
|
||
|
||
def test_all_splits_subsets_evaluated_with_overwrite( | ||
model, multilingual_tasks, tmp_path | ||
): | ||
evaluation = MTEB(tasks=multilingual_tasks) | ||
results = evaluation.run( | ||
model, | ||
eval_splits=[ | ||
"test", | ||
], | ||
output_folder=str(tmp_path / "all_splits_subsets_evaluated_with_overwrite"), | ||
verbosity=2, | ||
eval_subsets=["fra"], | ||
) | ||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert "MockMultilingualRetrievalTask" in last_evaluated_splits | ||
assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 | ||
assert results[0].scores.keys() == {"test"} | ||
for split in ["test"]: | ||
assert len(results[0].scores[split]) == 1 | ||
assert sorted(results[0].languages) == ["fra"] | ||
|
||
results2 = evaluation.run( | ||
model, | ||
eval_splits=["test"], | ||
output_folder=str(tmp_path / "all_splits_subsets_evaluated_with_overwrite"), | ||
verbosity=2, | ||
eval_subsets=["fra"], | ||
overwrite_results=True, | ||
) | ||
last_evaluated_splits = evaluation.get_last_evaluated_splits() | ||
assert "MockMultilingualRetrievalTask" in last_evaluated_splits | ||
assert len(last_evaluated_splits["MockMultilingualRetrievalTask"]) == 1 | ||
assert results2[0].scores.keys() == {"test"} | ||
for split in ["test"]: | ||
assert len(results2[0].scores[split]) == 1 | ||
assert sorted(results2[0].languages) == ["fra"] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `existing_results=None` here, then won't `missing_evaluations` contain all specified splits? Maybe this call can be omitted or simplified to an assignment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If both `task_eval_splits` and `task_subsets` are `None`, all splits will be selected, but filtering for splits and subsets is still necessary. I think calling the same function is easier to understand than adding more complex logic.